From a7fe922b98bb45be7dce7c1cfe668ec27eeddc74 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Aug 17 2016 19:33:52 +0000 Subject: Vendor import of llvm release_39 branch r278877: https://llvm.org/svn/llvm-project/llvm/branches/release_39@278877 --- diff --git a/CMakeLists.txt b/CMakeLists.txt index f102424..0393150 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -293,6 +293,7 @@ endif() option(LLVM_ENABLE_CXX1Y "Compile with C++1y enabled." OFF) option(LLVM_ENABLE_LIBCXX "Use libc++ if available." OFF) option(LLVM_ENABLE_LIBCXXABI "Use libc++abi when using libc++." OFF) +option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF) option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON) option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF) diff --git a/LICENSE.TXT b/LICENSE.TXT index 8b1585d..555c8bb 100644 --- a/LICENSE.TXT +++ b/LICENSE.TXT @@ -61,8 +61,6 @@ licenses, and/or restrictions: Program Directory ------- --------- -Autoconf llvm/autoconf - llvm/projects/ModuleMaker/autoconf Google Test llvm/utils/unittest/googletest OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex} pyyaml tests llvm/test/YAMLParser/{*.data, LICENSE.TXT} diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 22b4408..a0a7995 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -144,6 +144,12 @@ function(add_flag_or_print_warning flag name) endif() endfunction() +if(LLVM_ENABLE_LLD) + check_cxx_compiler_flag("-fuse-ld=lld" CXX_SUPPORTS_LLD) + append_if(CXX_SUPPORTS_LLD "-fuse-ld=lld" + CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) +endif() + if( LLVM_ENABLE_PIC ) if( XCODE ) # Xcode has -mdynamic-no-pic on by default, which overrides -fPIC. 
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst index 6a54343..2f5a27c 100644 --- a/docs/CodeGenerator.rst +++ b/docs/CodeGenerator.rst @@ -436,7 +436,7 @@ For example, consider this simple LLVM example: The X86 instruction selector might produce this machine code for the ``div`` and ``ret``: -.. code-block:: llvm +.. code-block:: text ;; Start of div %EAX = mov %reg1024 ;; Copy X (in reg1024) into EAX @@ -453,7 +453,7 @@ By the end of code generation, the register allocator would coalesce the registers and delete the resultant identity moves producing the following code: -.. code-block:: llvm +.. code-block:: text ;; X is in EAX, Y is in ECX mov %EAX, %EDX @@ -965,7 +965,7 @@ target code. For example, consider the following LLVM fragment: This LLVM code corresponds to a SelectionDAG that looks basically like this: -.. code-block:: llvm +.. code-block:: text (fadd:f32 (fmul:f32 (fadd:f32 W, X), Y), Z) diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst index a0ca1bf..413b6f4 100644 --- a/docs/CommandGuide/FileCheck.rst +++ b/docs/CommandGuide/FileCheck.rst @@ -144,7 +144,7 @@ exists anywhere in the file. The FileCheck -check-prefix option ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The FileCheck :option:`-check-prefix` option allows multiple test +The FileCheck `-check-prefix` option allows multiple test configurations to be driven from one `.ll` file. This is useful in many circumstances, for example, testing different architectural variants with :program:`llc`. Here's a simple example: @@ -303,7 +303,7 @@ be aware that the definition rule can match `after` its use. So, for instance, the code below will pass: -.. code-block:: llvm +.. code-block:: text ; CHECK-DAG: vmov.32 [[REG2:d[0-9]+]][0] ; CHECK-DAG: vmov.32 [[REG2]][1] @@ -312,7 +312,7 @@ So, for instance, the code below will pass: While this other code, will not: -.. code-block:: llvm +.. 
code-block:: text ; CHECK-DAG: vmov.32 [[REG2:d[0-9]+]][0] ; CHECK-DAG: vmov.32 [[REG2]][1] @@ -473,7 +473,7 @@ To match newline characters in regular expressions the character class matches output of the form (from llvm-dwarfdump): -.. code-block:: llvm +.. code-block:: text DW_AT_location [DW_FORM_sec_offset] (0x00000233) DW_AT_name [DW_FORM_strp] ( .debug_str[0x000000c9] = "intd") diff --git a/docs/CommandGuide/llvm-nm.rst b/docs/CommandGuide/llvm-nm.rst index f666e1c..319e6e6 100644 --- a/docs/CommandGuide/llvm-nm.rst +++ b/docs/CommandGuide/llvm-nm.rst @@ -68,11 +68,11 @@ OPTIONS .. option:: -B (default) - Use BSD output format. Alias for :option:`--format=bsd`. + Use BSD output format. Alias for `--format=bsd`. .. option:: -P - Use POSIX.2 output format. Alias for :option:`--format=posix`. + Use POSIX.2 output format. Alias for `--format=posix`. .. option:: --debug-syms, -a diff --git a/docs/CommandGuide/opt.rst b/docs/CommandGuide/opt.rst index 3a050f7..7b9255d 100644 --- a/docs/CommandGuide/opt.rst +++ b/docs/CommandGuide/opt.rst @@ -12,16 +12,16 @@ DESCRIPTION The :program:`opt` command is the modular LLVM optimizer and analyzer. It takes LLVM source files as input, runs the specified optimizations or analyses on it, and then outputs the optimized file or the analysis results. The -function of :program:`opt` depends on whether the :option:`-analyze` option is +function of :program:`opt` depends on whether the `-analyze` option is given. -When :option:`-analyze` is specified, :program:`opt` performs various analyses +When `-analyze` is specified, :program:`opt` performs various analyses of the input source. It will usually print the results on standard output, but in a few cases, it will print output to standard error or generate a file with the analysis output, which is usually done when the output is meant for another program. 
-While :option:`-analyze` is *not* given, :program:`opt` attempts to produce an +While `-analyze` is *not* given, :program:`opt` attempts to produce an optimized output file. The optimizations available via :program:`opt` depend upon what libraries were linked into it as well as any additional libraries that have been loaded with the :option:`-load` option. Use the :option:`-help` @@ -68,19 +68,19 @@ OPTIONS .. option:: -disable-opt - This option is only meaningful when :option:`-std-link-opts` is given. It + This option is only meaningful when `-std-link-opts` is given. It disables most passes. .. option:: -strip-debug This option causes opt to strip debug information from the module before - applying other optimizations. It is essentially the same as :option:`-strip` + applying other optimizations. It is essentially the same as `-strip` but it ensures that stripping of debug information is done first. .. option:: -verify-each This option causes opt to add a verify pass after every pass otherwise - specified on the command line (including :option:`-verify`). This is useful + specified on the command line (including `-verify`). This is useful for cases where it is suspected that a pass is creating an invalid module but it is not clear which pass is doing it. diff --git a/docs/ExceptionHandling.rst b/docs/ExceptionHandling.rst index 41dd4b6..a44fb92 100644 --- a/docs/ExceptionHandling.rst +++ b/docs/ExceptionHandling.rst @@ -406,7 +406,7 @@ outlined. After the handler is outlined, this intrinsic is simply removed. ``llvm.eh.exceptionpointer`` ---------------------------- -.. code-block:: llvm +.. code-block:: text i8 addrspace(N)* @llvm.eh.padparam.pNi8(token %catchpad) @@ -427,7 +427,7 @@ backend. Uses of them are generated by the backend's ``llvm.eh.sjlj.setjmp`` ~~~~~~~~~~~~~~~~~~~~~~~ -.. code-block:: llvm +.. code-block:: text i32 @llvm.eh.sjlj.setjmp(i8* %setjmp_buf) @@ -664,7 +664,7 @@ all of the new IR instructions: return 0; } -.. code-block:: llvm +.. 
code-block:: text define i32 @f() nounwind personality i32 (...)* @__CxxFrameHandler3 { entry: @@ -741,7 +741,7 @@ C++ code: } } -.. code-block:: llvm +.. code-block:: text define void @f() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { entry: diff --git a/docs/Extensions.rst b/docs/Extensions.rst index c8ff07c..f702921 100644 --- a/docs/Extensions.rst +++ b/docs/Extensions.rst @@ -43,7 +43,7 @@ The following additional relocation types are supported: corresponds to the COFF relocation types ``IMAGE_REL_I386_DIR32NB`` (32-bit) or ``IMAGE_REL_AMD64_ADDR32NB`` (64-bit). -.. code-block:: gas +.. code-block:: text .text fun: diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst index 56b4b9f..81605bc 100644 --- a/docs/GarbageCollection.rst +++ b/docs/GarbageCollection.rst @@ -204,7 +204,7 @@ IR features is specified by the selected :ref:`GC strategy description Specifying GC code generation: ``gc "..."`` ------------------------------------------- -.. code-block:: llvm +.. code-block:: text define @name(...) gc "name" { ... } diff --git a/docs/GetElementPtr.rst b/docs/GetElementPtr.rst index c9cfae6..f39f1d9 100644 --- a/docs/GetElementPtr.rst +++ b/docs/GetElementPtr.rst @@ -105,7 +105,7 @@ memory, or a global variable. To make this clear, let's consider a more obtuse example: -.. code-block:: llvm +.. code-block:: text %MyVar = uninitialized global i32 ... @@ -142,7 +142,7 @@ Quick answer: there are no superfluous indices. This question arises most often when the GEP instruction is applied to a global variable which is always a pointer type. For example, consider this: -.. code-block:: llvm +.. code-block:: text %MyStruct = uninitialized global { float*, i32 } ... @@ -178,7 +178,7 @@ The GetElementPtr instruction dereferences nothing. That is, it doesn't access memory in any way. That's what the Load and Store instructions are for. GEP is only involved in the computation of addresses. For example, consider this: -.. 
code-block:: llvm +.. code-block:: text %MyVar = uninitialized global { [40 x i32 ]* } ... @@ -195,7 +195,7 @@ illegal. In order to access the 18th integer in the array, you would need to do the following: -.. code-block:: llvm +.. code-block:: text %idx = getelementptr { [40 x i32]* }, { [40 x i32]* }* %, i64 0, i32 0 %arr = load [40 x i32]** %idx @@ -204,7 +204,7 @@ following: In this case, we have to load the pointer in the structure with a load instruction before we can index into the array. If the example was changed to: -.. code-block:: llvm +.. code-block:: text %MyVar = uninitialized global { [40 x i32 ] } ... diff --git a/docs/HowToUseInstrMappings.rst b/docs/HowToUseInstrMappings.rst index 8a3e7c8..1c586b4 100755 --- a/docs/HowToUseInstrMappings.rst +++ b/docs/HowToUseInstrMappings.rst @@ -30,7 +30,7 @@ instructions with each other. These tables are emitted in the ``XXXInstrInfo.inc`` file along with the functions to query them. Following is the definition of ``InstrMapping`` class definied in Target.td file: -.. code-block:: llvm +.. code-block:: text class InstrMapping { // Used to reduce search space only to the instructions using this @@ -69,7 +69,7 @@ non-predicated form by assigning appropriate values to the ``InstrMapping`` fields. For this relationship, non-predicated instructions are treated as key instruction since they are the one used to query the interface function. -.. code-block:: llvm +.. code-block:: text def getPredOpcode : InstrMapping { // Choose a FilterClass that is used as a base class for all the @@ -116,7 +116,7 @@ to include relevant information in its definition. For example, consider following to be the current definitions of ADD, ADD_pt (true) and ADD_pf (false) instructions: -.. code-block:: llvm +.. 
code-block:: text def ADD : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b), "$dst = add($a, $b)", @@ -137,7 +137,7 @@ In this step, we modify these instructions to include the information required by the relationship model, getPredOpcode, so that they can be related. -.. code-block:: llvm +.. code-block:: text def ADD : PredRel, ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b), "$dst = add($a, $b)", diff --git a/docs/InAlloca.rst b/docs/InAlloca.rst index c7609cd..a75f22d 100644 --- a/docs/InAlloca.rst +++ b/docs/InAlloca.rst @@ -41,7 +41,7 @@ that passes two default-constructed ``Foo`` objects to ``g`` in the g(Foo(), Foo()); } -.. code-block:: llvm +.. code-block:: text %struct.Foo = type { i32, i32 } declare void @Foo_ctor(%struct.Foo* %this) diff --git a/docs/LangRef.rst b/docs/LangRef.rst index f6dda59..ce15c47 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -839,7 +839,7 @@ Note that the Mach-O platform doesn't support COMDATs and ELF only supports Here is an example of a COMDAT group where a function will only be selected if the COMDAT key's section is the largest: -.. code-block:: llvm +.. code-block:: text $foo = comdat largest @foo = global i32 2, comdat($foo) @@ -851,7 +851,7 @@ the COMDAT key's section is the largest: As a syntactic sugar the ``$name`` can be omitted if the name is the same as the global name: -.. code-block:: llvm +.. code-block:: text $foo = comdat any @foo = global i32 2, comdat @@ -875,7 +875,7 @@ if a collision occurs in the symbol table. The combined use of COMDATS and section attributes may yield surprising results. For example: -.. code-block:: llvm +.. code-block:: text $foo = comdat any $bar = comdat any @@ -1205,7 +1205,7 @@ makes the format of the prologue data highly target dependent. A trivial example of valid prologue data for the x86 architecture is ``i8 144``, which encodes the ``nop`` instruction: -.. code-block:: llvm +.. code-block:: text define void @f() prologue i8 144 { ... 
} @@ -1213,7 +1213,7 @@ Generally prologue data can be formed by encoding a relative branch instruction which skips the metadata, as in this example of valid prologue data for the x86_64 architecture, where the first two bytes encode ``jmp .+10``: -.. code-block:: llvm +.. code-block:: text %0 = type <{ i8, i8, i8* }> @@ -2237,7 +2237,7 @@ source file name to the local function name. The syntax for the source file name is simply: -.. code-block:: llvm +.. code-block:: text source_filename = "/path/to/source.c" @@ -2847,7 +2847,7 @@ cleared low bit. However, in the ``%C`` example, the optimizer is allowed to assume that the '``undef``' operand could be the same as ``%Y``, allowing the whole '``select``' to be eliminated. -.. code-block:: llvm +.. code-block:: text %A = xor undef, undef @@ -2899,7 +2899,7 @@ does not execute at all. This allows us to delete the divide and all code after it. Because the undefined operation "can't happen", the optimizer can assume that it occurs in dead code. -.. code-block:: llvm +.. code-block:: text a: store undef -> %X b: store %X -> undef @@ -3884,7 +3884,7 @@ their operand. For example: Metadata nodes that aren't uniqued use the ``distinct`` keyword. For example: -.. code-block:: llvm +.. code-block:: text !0 = distinct !{!"test\00", i32 10} @@ -3949,7 +3949,7 @@ fields are tuples containing the debug info to be emitted along with the compile unit, regardless of code optimizations (some nodes are only emitted if there are references to them from instructions). -.. code-block:: llvm +.. code-block:: text !0 = !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, flags: "-O2", runtimeVersion: 2, @@ -3985,7 +3985,7 @@ DIBasicType ``DIBasicType`` nodes represent primitive types, such as ``int``, ``bool`` and ``float``. ``tag:`` defaults to ``DW_TAG_base_type``. -.. code-block:: llvm +.. 
code-block:: text !0 = !DIBasicType(name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) @@ -3994,7 +3994,7 @@ DIBasicType The ``encoding:`` describes the details of the type. Usually it's one of the following: -.. code-block:: llvm +.. code-block:: text DW_ATE_address = 1 DW_ATE_boolean = 2 @@ -4014,7 +4014,7 @@ refers to a tuple; the first operand is the return type, while the rest are the types of the formal arguments in order. If the first operand is ``null``, that represents a function with no return value (such as ``void foo() {}`` in C++). -.. code-block:: llvm +.. code-block:: text !0 = !BasicType(name: "int", size: 32, align: 32, DW_ATE_signed) !1 = !BasicType(name: "char", size: 8, align: 8, DW_ATE_signed_char) @@ -4028,7 +4028,7 @@ DIDerivedType ``DIDerivedType`` nodes represent types derived from other types, such as qualified types. -.. code-block:: llvm +.. code-block:: text !0 = !DIBasicType(name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) @@ -4037,7 +4037,7 @@ qualified types. The following ``tag:`` values are valid: -.. code-block:: llvm +.. code-block:: text DW_TAG_member = 13 DW_TAG_pointer_type = 15 @@ -4089,7 +4089,7 @@ does not have ``flags: DIFlagFwdDecl`` set. LLVM tools that link modules together will unique such definitions at parse time via the ``identifier:`` field, even if the nodes are ``distinct``. -.. code-block:: llvm +.. code-block:: text !0 = !DIEnumerator(name: "SixKind", value: 7) !1 = !DIEnumerator(name: "SevenKind", value: 7) @@ -4100,7 +4100,7 @@ field, even if the nodes are ``distinct``. The following ``tag:`` values are valid: -.. code-block:: llvm +.. code-block:: text DW_TAG_array_type = 1 DW_TAG_class_type = 2 @@ -4219,7 +4219,7 @@ type with an ODR ``identifier:`` and that does not set ``flags: DIFwdDecl``, then the subprogram declaration is uniqued based only on its ``linkageName:`` and ``scope:``. -.. code-block:: llvm +.. 
code-block:: text define void @_Z3foov() !dbg !0 { ... @@ -4244,7 +4244,7 @@ DILexicalBlock two lexical blocks at same depth. They are valid targets for ``scope:`` fields. -.. code-block:: llvm +.. code-block:: text !0 = distinct !DILexicalBlock(scope: !1, file: !2, line: 7, column: 35) @@ -4290,7 +4290,7 @@ the ``arg:`` field is set to non-zero, then this variable is a subprogram parameter, and it will be included in the ``variables:`` field of its :ref:`DISubprogram`. -.. code-block:: llvm +.. code-block:: text !0 = !DILocalVariable(name: "this", arg: 1, scope: !3, file: !2, line: 7, type: !3, flags: DIFlagArtificial) @@ -4313,7 +4313,7 @@ The current supported vocabulary is limited: - ``DW_OP_bit_piece, 16, 8`` specifies the offset and size (``16`` and ``8`` here, respectively) of the variable piece from the working expression. -.. code-block:: llvm +.. code-block:: text !0 = !DIExpression(DW_OP_deref) !1 = !DIExpression(DW_OP_plus, 3) @@ -4336,7 +4336,7 @@ DIImportedEntity ``DIImportedEntity`` nodes represent entities (such as modules) imported into a compile unit. -.. code-block:: llvm +.. code-block:: text !2 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, line: 7) @@ -4349,7 +4349,7 @@ The ``name:`` field is the macro identifier, followed by macro parameters when defining a function-like macro, and the ``value`` field is the token-string used to expand the macro identifier. -.. code-block:: llvm +.. code-block:: text !2 = !DIMacro(macinfo: DW_MACINFO_define, line: 7, name: "foo(x)", value: "((x) + 1)") @@ -4362,7 +4362,7 @@ DIMacroFile The ``nodes:`` field is a list of ``DIMacro`` and ``DIMacroFile`` nodes that appear in the included source file. -.. code-block:: llvm +.. code-block:: text !2 = !DIMacroFile(macinfo: DW_MACINFO_start_file, line: 7, file: !2, nodes: !3) @@ -5660,7 +5660,7 @@ block. Therefore, it must be the only non-phi instruction in the block. Example: """""""" -.. code-block:: llvm +.. 
code-block:: text dispatch1: %cs1 = catchswitch within none [label %handler0, label %handler1] unwind to caller @@ -5711,7 +5711,7 @@ the ``catchret``'s behavior is undefined. Example: """""""" -.. code-block:: llvm +.. code-block:: text catchret from %catch label %continue @@ -5761,7 +5761,7 @@ It transfers control to ``continue`` or unwinds out of the function. Example: """""""" -.. code-block:: llvm +.. code-block:: text cleanupret from %cleanup unwind to caller cleanupret from %cleanup unwind label %continue @@ -5851,7 +5851,7 @@ unsigned and/or signed overflow, respectively, occurs. Example: """""""" -.. code-block:: llvm +.. code-block:: text = add i32 4, %var ; yields i32:result = 4 + %var @@ -5890,7 +5890,7 @@ optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = fadd float 4.0, %var ; yields float:result = 4.0 + %var @@ -5942,7 +5942,7 @@ unsigned and/or signed overflow, respectively, occurs. Example: """""""" -.. code-block:: llvm +.. code-block:: text = sub i32 4, %var ; yields i32:result = 4 - %var = sub i32 0, %val ; yields i32:result = -%var @@ -5985,7 +5985,7 @@ unsafe floating point optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = fsub float 4.0, %var ; yields float:result = 4.0 - %var = fsub float -0.0, %val ; yields float:result = -%var @@ -6039,7 +6039,7 @@ unsigned and/or signed overflow, respectively, occurs. Example: """""""" -.. code-block:: llvm +.. code-block:: text = mul i32 4, %var ; yields i32:result = 4 * %var @@ -6078,7 +6078,7 @@ unsafe floating point optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = fmul float 4.0, %var ; yields float:result = 4.0 * %var @@ -6122,7 +6122,7 @@ such, "((a udiv exact b) mul b) == a"). Example: """""""" -.. code-block:: llvm +.. code-block:: text = udiv i32 4, %var ; yields i32:result = 4 / %var @@ -6168,7 +6168,7 @@ a :ref:`poison value ` if the result would be rounded. Example: """""""" -.. code-block:: llvm +.. 
code-block:: text = sdiv i32 4, %var ; yields i32:result = 4 / %var @@ -6207,7 +6207,7 @@ unsafe floating point optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = fdiv float 4.0, %var ; yields float:result = 4.0 / %var @@ -6249,7 +6249,7 @@ Taking the remainder of a division by zero leads to undefined behavior. Example: """""""" -.. code-block:: llvm +.. code-block:: text = urem i32 4, %var ; yields i32:result = 4 % %var @@ -6304,7 +6304,7 @@ result of the division and the remainder.) Example: """""""" -.. code-block:: llvm +.. code-block:: text = srem i32 4, %var ; yields i32:result = 4 % %var @@ -6344,7 +6344,7 @@ to enable otherwise unsafe floating point optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = frem float 4.0, %var ; yields float:result = 4.0 % %var @@ -6406,7 +6406,7 @@ nsw/nuw bits in (mul %op1, (shl 1, %op2)). Example: """""""" -.. code-block:: llvm +.. code-block:: text = shl i32 4, %var ; yields i32: 4 << %var = shl i32 4, 2 ; yields i32: 16 @@ -6455,7 +6455,7 @@ non-zero. Example: """""""" -.. code-block:: llvm +.. code-block:: text = lshr i32 4, 1 ; yields i32:result = 2 = lshr i32 4, 2 ; yields i32:result = 1 @@ -6506,7 +6506,7 @@ non-zero. Example: """""""" -.. code-block:: llvm +.. code-block:: text = ashr i32 4, 1 ; yields i32:result = 2 = ashr i32 4, 2 ; yields i32:result = 1 @@ -6558,7 +6558,7 @@ The truth table used for the '``and``' instruction is: Example: """""""" -.. code-block:: llvm +.. code-block:: text = and i32 4, %var ; yields i32:result = 4 & %var = and i32 15, 40 ; yields i32:result = 8 @@ -6657,7 +6657,7 @@ The truth table used for the '``xor``' instruction is: Example: """""""" -.. code-block:: llvm +.. code-block:: text = xor i32 4, %var ; yields i32:result = 4 ^ %var = xor i32 15, 40 ; yields i32:result = 39 @@ -6710,7 +6710,7 @@ exceeds the length of ``val``, the results are undefined. Example: """""""" -.. code-block:: llvm +.. 
code-block:: text = extractelement <4 x i32> %vec, i32 0 ; yields i32 @@ -6752,7 +6752,7 @@ undefined. Example: """""""" -.. code-block:: llvm +.. code-block:: text = insertelement <4 x i32> %vec, i32 1, i32 0 ; yields <4 x i32> @@ -6800,7 +6800,7 @@ only one vector. Example: """""""" -.. code-block:: llvm +.. code-block:: text = shufflevector <4 x i32> %v1, <4 x i32> %v2, <4 x i32> ; yields <4 x i32> @@ -6859,7 +6859,7 @@ the index operands. Example: """""""" -.. code-block:: llvm +.. code-block:: text = extractvalue {i32, float} %agg, 0 ; yields i32 @@ -8126,7 +8126,7 @@ or :ref:`ptrtoint ` instructions first. Example: """""""" -.. code-block:: llvm +.. code-block:: text %X = bitcast i8 255 to i8 ; yields i8 :-1 %Y = bitcast i32* %x to sint* ; yields sint*:%x @@ -8265,7 +8265,7 @@ as the values being compared. Otherwise, the result is an ``i1``. Example: """""""" -.. code-block:: llvm +.. code-block:: text = icmp eq i32 4, 5 ; yields: result=false = icmp ne float* %X, %X ; yields: result=false @@ -8379,7 +8379,7 @@ assumptions to be made about the values of input arguments; namely Example: """""""" -.. code-block:: llvm +.. code-block:: text = fcmp oeq float 4.0, 5.0 ; yields: result=false = fcmp one float 4.0, 5.0 ; yields: result=true @@ -8815,7 +8815,7 @@ that does not carry an appropriate :ref:`"funclet" bundle `. Example: """""""" -.. code-block:: llvm +.. code-block:: text dispatch: %cs = catchswitch within none [label %handler0] unwind to caller @@ -8885,7 +8885,7 @@ that does not carry an appropriate :ref:`"funclet" bundle `. Example: """""""" -.. code-block:: llvm +.. code-block:: text %tok = cleanuppad within %cs [] @@ -12481,19 +12481,19 @@ optimistic assumptions made during compilation. The semantics of ``@llvm.experimental.deoptimize`` -- its body is defined to be equivalent to: -.. code-block:: llvm +.. 
code-block:: text - define void @llvm.experimental.guard(i1 %pred, ) { - %realPred = and i1 %pred, undef - br i1 %realPred, label %continue, label %leave [, !make.implicit !{}] + define void @llvm.experimental.guard(i1 %pred, ) { + %realPred = and i1 %pred, undef + br i1 %realPred, label %continue, label %leave [, !make.implicit !{}] - leave: - call void @llvm.experimental.deoptimize() [ "deopt"() ] - ret void + leave: + call void @llvm.experimental.deoptimize() [ "deopt"() ] + ret void - continue: - ret void - } + continue: + ret void + } with the optional ``[, !make.implicit !{}]`` present if and only if it diff --git a/docs/MIRLangRef.rst b/docs/MIRLangRef.rst index a5f8c8c..f6ee6cc 100644 --- a/docs/MIRLangRef.rst +++ b/docs/MIRLangRef.rst @@ -111,7 +111,6 @@ Here is an example of a YAML document that contains an LLVM module: .. code-block:: llvm - --- | define i32 @inc(i32* %x) { entry: %0 = load i32, i32* %x @@ -119,7 +118,6 @@ Here is an example of a YAML document that contains an LLVM module: store i32 %1, i32* %x ret i32 %1 } - ... .. _YAML block literal string: http://www.yaml.org/spec/1.2/spec.html#id2795688 @@ -129,7 +127,7 @@ Machine Functions The remaining YAML documents contain the machine functions. This is an example of such YAML document: -.. code-block:: llvm +.. code-block:: text --- name: inc @@ -172,7 +170,7 @@ A machine basic block is defined in a single block definition source construct that contains the block's ID. The example below defines two blocks that have an ID of zero and one: -.. code-block:: llvm +.. code-block:: text bb.0: @@ -182,7 +180,7 @@ The example below defines two blocks that have an ID of zero and one: A machine basic block can also have a name. It should be specified after the ID in the block's definition: -.. code-block:: llvm +.. code-block:: text bb.0.entry: ; This block's name is "entry" @@ -196,7 +194,7 @@ Block References The machine basic blocks are identified by their ID numbers. 
Individual blocks are referenced using the following syntax: -.. code-block:: llvm +.. code-block:: text %bb.[.] @@ -213,7 +211,7 @@ Successors The machine basic block's successors have to be specified before any of the instructions: -.. code-block:: llvm +.. code-block:: text bb.0.entry: successors: %bb.1.then, %bb.2.else @@ -227,7 +225,7 @@ The branch weights can be specified in brackets after the successor blocks. The example below defines a block that has two successors with branch weights of 32 and 16: -.. code-block:: llvm +.. code-block:: text bb.0.entry: successors: %bb.1.then(32), %bb.2.else(16) @@ -240,7 +238,7 @@ Live In Registers The machine basic block's live in registers have to be specified before any of the instructions: -.. code-block:: llvm +.. code-block:: text bb.0.entry: liveins: %edi, %esi @@ -255,7 +253,7 @@ Miscellaneous Attributes The attributes ``IsAddressTaken``, ``IsLandingPad`` and ``Alignment`` can be specified in brackets after the block's definition: -.. code-block:: llvm +.. code-block:: text bb.0.entry (address-taken): @@ -278,7 +276,7 @@ The instruction's name is usually specified before the operands. The example below shows an instance of the X86 ``RETQ`` instruction with a single machine operand: -.. code-block:: llvm +.. code-block:: text RETQ %eax @@ -287,7 +285,7 @@ operands, the instruction's name has to be specified after them. The example below shows an instance of the AArch64 ``LDPXpost`` instruction with three defined register operands: -.. code-block:: llvm +.. code-block:: text %sp, %fp, %lr = LDPXpost %sp, 2 @@ -303,7 +301,7 @@ Instruction Flags The flag ``frame-setup`` can be specified before the instruction's name: -.. code-block:: llvm +.. code-block:: text %fp = frame-setup ADDXri %sp, 0, 0 @@ -321,13 +319,13 @@ but they can also be used in a number of other places, like the The physical registers are identified by their name. They use the following syntax: -.. code-block:: llvm +.. 
code-block:: text % The example below shows three X86 physical registers: -.. code-block:: llvm +.. code-block:: text %eax %r15 @@ -336,13 +334,13 @@ The example below shows three X86 physical registers: The virtual registers are identified by their ID number. They use the following syntax: -.. code-block:: llvm +.. code-block:: text % Example: -.. code-block:: llvm +.. code-block:: text %0 @@ -366,7 +364,7 @@ The immediate machine operands are untyped, 64-bit signed integers. The example below shows an instance of the X86 ``MOV32ri`` instruction that has an immediate machine operand ``-42``: -.. code-block:: llvm +.. code-block:: text %eax = MOV32ri -42 @@ -384,14 +382,14 @@ machine operands. The register operands can also have optional and a reference to the tied register operand. The full syntax of a register operand is shown below: -.. code-block:: llvm +.. code-block:: text [] [ : ] [ (tied-def ) ] This example shows an instance of the X86 ``XOR32rr`` instruction that has 5 register operands with different register flags: -.. code-block:: llvm +.. code-block:: text dead %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags, implicit-def %al @@ -446,7 +444,7 @@ the subregister indices. The example below shows an instance of the ``COPY`` pseudo instruction that uses the X86 ``sub_8bit`` subregister index to copy 8 lower bits from the 32-bit virtual register 0 to the 8-bit virtual register 1: -.. code-block:: llvm +.. code-block:: text %1 = COPY %0:sub_8bit @@ -461,7 +459,7 @@ The global value machine operands reference the global values from the The example below shows an instance of the X86 ``MOV64rm`` instruction that has a global value operand named ``G``: -.. code-block:: llvm +.. code-block:: text %rax = MOV64rm %rip, 1, _, @G, _ diff --git a/docs/MarkedUpDisassembly.rst b/docs/MarkedUpDisassembly.rst index cc4dbc8..df8befe 100644 --- a/docs/MarkedUpDisassembly.rst +++ b/docs/MarkedUpDisassembly.rst @@ -70,7 +70,7 @@ clients. 
For example, a possible annotation of an ARM load of a stack-relative location might be annotated as: -.. code-block:: nasm +.. code-block:: text ldr , , ]> diff --git a/docs/MergeFunctions.rst b/docs/MergeFunctions.rst index f808010..b87cea6 100644 --- a/docs/MergeFunctions.rst +++ b/docs/MergeFunctions.rst @@ -394,7 +394,7 @@ and in right function "*FR*". And every part of *left* place is equal to the corresponding part of *right* place, and (!) both parts use *Value* instances, for example: -.. code-block:: llvm +.. code-block:: text instr0 i32 %LV ; left side, function FL instr0 i32 %RV ; right side, function FR @@ -409,13 +409,13 @@ in "*FL*" and "*FR*". Consider small example here: -.. code-block:: llvm +.. code-block:: text define void %f(i32 %pf0, i32 %pf1) { instr0 i32 %pf0 instr1 i32 %pf1 instr2 i32 123 } -.. code-block:: llvm +.. code-block:: text define void %g(i32 %pg0, i32 %pg1) { instr0 i32 %pg0 instr1 i32 %pg0 instr2 i32 123 diff --git a/docs/NVPTXUsage.rst b/docs/NVPTXUsage.rst index 8b8c40f..fdfc8e4 100644 --- a/docs/NVPTXUsage.rst +++ b/docs/NVPTXUsage.rst @@ -37,7 +37,7 @@ code. By default, the back-end will emit device functions. Metadata is used to declare a function as a kernel function. This metadata is attached to the ``nvvm.annotations`` named metadata object, and has the following format: -.. code-block:: llvm +.. code-block:: text !0 = !{, metadata !"kernel", i32 1} diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 54f2d53..dc76617 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -40,7 +40,10 @@ Non-comprehensive list of changes in this release * There is no longer a "global context" available in LLVM, except for the C API. -* .. note about autoconf build having been removed. +* The autoconf build system has been removed in favor of CMake. LLVM 3.9 + requires CMake 3.4.3 or later to build. For information about using CMake + please see the documentation on :doc:`CMake`. 
For information about the CMake + language there is also a :doc:`CMakePrimer` document available. * .. note about C API functions LLVMParseBitcode, LLVMParseBitcodeInContext, LLVMGetBitcodeModuleInContext and @@ -69,11 +72,13 @@ Non-comprehensive list of changes in this release need to be updated to replace the argument node and remove any dead nodes in cases where they currently return an ``SDNode *`` from this interface. -* Introduction of ThinLTO: [FIXME: needs to be documented more extensively in - /docs/ ; ping Mehdi/Teresa before the release if not done] - * Raised the minimum required CMake version to 3.4.3. +* Added the MemorySSA analysis, which hopes to replace MemoryDependenceAnalysis. + It should provide higher-quality results than MemDep, and be algorithmically + faster than MemDep. Currently, GVNHoist (which is off by default) makes use of + MemorySSA. + .. NOTE For small 1-3 sentence descriptions, just add an entry at the end of this list. If your description won't fit comfortably in one bullet @@ -93,6 +98,32 @@ Non-comprehensive list of changes in this release Makes programs 10x faster by doing Special New Thing. +GCC ABI Tag +----------- + +Recently, many of the Linux distributions (ex. `Fedora `_, +`Debian `_, `Ubuntu `_) +have moved on to use the new `GCC ABI `_ +to work around `C++11 incompatibilities in libstdc++ `_. +This caused `incompatibility problems `_ +with other compilers (ex. Clang), which needed to be fixed, but due to the +experimental nature of GCC's own implementation, it took a long time for it to +land in LLVM (`here `_ and +`here `_), not in time for the 3.8 release. + +Those patches are now present in the 3.9.0 release and should be working on the +majority of cases, as they have been tested thoroughly. However, some bugs were +`filed in GCC `_ and have not +yet been fixed, so there may be corner cases not covered by either GCC or Clang.
+Bug fixes to those problems should be reported in Bugzilla (either LLVM or GCC), +and patches to LLVM's trunk are very likely to be back-ported to future 3.9.x +releases (depends on how destructive it is). + +Unfortunately, these patches won't be back-ported to 3.8.x or earlier, so we +strongly recommend people to use 3.9.x when GCC ABI cases are at stake. + +For a more in-depth view of the issue, check our `Bugzilla entry `_. + Changes to the LLVM IR ---------------------- @@ -110,16 +141,98 @@ link-time may be differently optimized than the one what was visible during optimization, and may have arbitrarily different observable behavior. See `PR26774 `_ for more details. -Changes to the ARM Backend +Support for ThinLTO +------------------- + +LLVM now supports ThinLTO compilation, which can be invoked by compiling +and linking with -flto=thin. The gold linker plugin, as well as linkers +that use the new ThinLTO API in libLTO (like ld64), will transparently +execute the ThinLTO backends in parallel threads. +For more information on ThinLTO and the LLVM implementation, see the +`ThinLTO blog post `_. + +Changes to the ARM Targets -------------------------- - During this release ... +**During this release the AArch64 backend has:** + +* Gained support for Qualcomm's Kryo and Broadcom's Vulcan CPUs, including + scheduling models. +* Landed a scheduling model for Samsung's Exynos M1. +* Seen a lot of work on GlobalISel. +* Learned a few more useful combines (fadd and fmul into fmadd, adjustments to the + stack pointer for callee-save stack memory and local stack memory etc). +* Gained support for the Swift calling convention. +* Switched to using SubtargetFeatures rather than testing for specific CPUs and + to using TableGen for handling system instruction operands. +* Like ARM, AArch64 is now using the TargetParser, so no more StringSwitches + matching CPU, FPU or feature names will be accepted in normal code. +* Clang can now self-host itself using LLD on AArch64. 
+* Gained a big batch of tests from Halide. + + Furthermore, LLDB now supports AArch64 compact unwind tables, as used on iOS, + tvOS and watchOS. + +**During this release the ARM target has:** + +* ARMv8.2-A can now be targeted directly via Clang flags. +* Added preliminary support for Cortex-R8. +* LLDB can now parse EABI attributes for an ELF input. +* Initial ARM/Thumb support was added to LLD. +* The ExecutionEngine now supports COFF/ARM. +* Swift calling convention was ported to ARM. +* A large number of codegen fixes around ARMv8, DSP, correct sub-target support, + relocations, EABI, EHABI, Windows on ARM, atomics. +* Improved assembler support for Linux/Android/Chromium sub-projects. +* Initial support for MUSL (libc) on ARM. +* Support for Thumb1 targets in libunwind. +* Gained a big batch of tests from Halide. Changes to the MIPS Target -------------------------- - During this release ... +**During this release the MIPS target has:** + +* Enabled the Integrated Assembler by default for all ``mips-*`` and + ``mipsel-*`` triples. +* Significantly improved the Integrated Assembler support for the n64 ABI. +* Added the Clang frontend ``-mcompact-branches={never,optimal,always}`` option + that controls how LLVM generates compact branches for MIPS targets. +* Improved performance and code size for stack pointer adjustments in functions + with large frames. +* Implemented many instructions from the microMIPS32R6 ISA and added CodeGen + support for most of them. +* Added support for the triple used by Debian Stretch for little endian + MIPS64, i.e. ``mips64el-linux-gnuabi64``. +* Removed EABI which was neither tested nor properly supported. +* Gained the ability to self-host on MIPS32R6. +* Gained the ability to self-host on MIPS64R2 and MIPS64R6 when using the n64 + ABI. +* Added support for the ``LA`` macro in PIC mode for o32. +* Added support for safestack in compiler-rt. +* Added support for the MIPS n64 ABI in LLD.
+* Added LLD support for TLS relocations for both o32 and n64 MIPS ABIs. + +**The MIPS target has also fixed various bugs including the following notable +fixes:** + +* Delay slots are no longer filled multiple times when either ``-save-temps`` + or ``-via-file-asm`` are used. +* Updated n32 and n64 to follow the standard ELF conventions for label prefixes + (``.L``), whereas o32 still uses its own (``$``). +* Properly sign-extend values to GPR width for instructions that expect 32-bit + values on 64-bit ISAs. +* Several fixes for the delay-slot filler pass, including correct + forbidden-slot hazard handling. +* Fixed several errors caught by the machine verifier when turned on for MIPS. +* Fixed broken predicate for ``SELECT`` patterns in MIPS64. +* Fixed wrong truncation of memory address for ``LL``/``SC`` sequences in + MIPS64. +* Fixed the o32, n32 and n64 handling of ``.cprestore`` directives when inside + a ``.set noat`` region by the Integrated Assembler. +* Fixed the ordering of ``HI``/``LO`` pairs in the relocation table. +* Fixed the generated ELF ``EFlags`` when Octeon is the target. Changes to the PowerPC Target @@ -140,9 +253,16 @@ Changes to the X86 Target extensions using ``-march=knl``. The switch enables the ISA extensions AVX-512{F, CD, ER, PF}. +* LLVM will now prefer ``PUSH`` instructions rather than ``%esp``-relative + ``MOV`` instructions for function calls at all optimization levels greater + than ``-O0``. Previously this transformation only occurred at ``-Os``. + Changes to the AMDGPU Target ----------------------------- + * Added backend support for OpenGL shader image, buffer storage, atomic + counter, and compute shader extensions (supported since Mesa 12) + * Mesa 11.0.x is no longer supported @@ -167,6 +287,21 @@ projects that have already been updated to work with LLVM 3.9. * A project +LDC - the LLVM-based D compiler +------------------------------- + +`D `_ is a language with C-like syntax and static typing.
It +pragmatically combines efficiency, control, and modeling power, with safety and +programmer productivity. D supports powerful concepts like Compile-Time Function +Execution (CTFE) and Template Meta-Programming, provides an innovative approach +to concurrency and offers many classical paradigms. + +`LDC `_ uses the frontend from the reference compiler +combined with LLVM as backend to produce efficient native code. LDC targets +x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM +and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64 +are underway. + Additional Information ====================== diff --git a/docs/SegmentedStacks.rst b/docs/SegmentedStacks.rst index c0bf32b..b1c588c 100644 --- a/docs/SegmentedStacks.rst +++ b/docs/SegmentedStacks.rst @@ -33,7 +33,7 @@ current stack limit (minus the amount of space needed to allocate a new block) - this slot's offset is again dictated by ``libgcc``. The generated assembly looks like this on x86-64: -.. code-block:: nasm +.. code-block:: text leaq -8(%rsp), %r10 cmpq %fs:112, %r10 diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst index 1815ee3..8c3142e 100644 --- a/docs/SourceLevelDebugging.rst +++ b/docs/SourceLevelDebugging.rst @@ -230,7 +230,7 @@ following C fragment, for example: Compiled to LLVM, this function would be represented like this: -.. code-block:: llvm +.. code-block:: text ; Function Attrs: nounwind ssp uwtable define void @foo() #0 !dbg !4 { @@ -303,7 +303,7 @@ The first intrinsic ``%llvm.dbg.declare`` encodes debugging information for the variable ``X``. The metadata ``!dbg !14`` attached to the intrinsic provides scope information for the variable ``X``. -.. code-block:: llvm +.. 
code-block:: text !14 = !DILocation(line: 2, column: 9, scope: !4) !4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5, @@ -327,7 +327,7 @@ The third intrinsic ``%llvm.dbg.declare`` encodes debugging information for variable ``Z``. The metadata ``!dbg !19`` attached to the intrinsic provides scope information for the variable ``Z``. -.. code-block:: llvm +.. code-block:: text !18 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5) !19 = !DILocation(line: 5, column: 11, scope: !18) @@ -390,7 +390,7 @@ Given an integer global variable declared as follows: a C/C++ front-end would generate the following descriptors: -.. code-block:: llvm +.. code-block:: text ;; ;; Define the global itself. @@ -456,7 +456,7 @@ Given a function declared as follows: a C/C++ front-end would generate the following descriptors: -.. code-block:: llvm +.. code-block:: text ;; ;; Define the anchor for subprograms. diff --git a/docs/Statepoints.rst b/docs/Statepoints.rst index a78ab3c..29b1be3 100644 --- a/docs/Statepoints.rst +++ b/docs/Statepoints.rst @@ -138,7 +138,7 @@ SSA value ``%obj.relocated`` which represents the potentially changed value of ``%obj`` after the safepoint and update any following uses appropriately. The resulting relocation sequence is: -.. code-block:: llvm +.. code-block:: text define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" { @@ -237,7 +237,7 @@ afterwards. If we extend our previous example to include a pointless derived pointer, we get: -.. code-block:: llvm +.. code-block:: text define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" { @@ -283,7 +283,7 @@ Let's assume a hypothetical GC--somewhat unimaginatively named "hypothetical-gc" --that requires that a TLS variable must be written to before and after a call to unmanaged code. The resulting relocation sequence is: -.. code-block:: llvm +.. 
code-block:: text @flag = thread_local global i32 0, align 4 @@ -662,7 +662,7 @@ distinguish between GC references and non-GC references in IR it is given. As an example, given this code: -.. code-block:: llvm +.. code-block:: text define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" { @@ -672,7 +672,7 @@ As an example, given this code: The pass would produce this IR: -.. code-block:: llvm +.. code-block:: text define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" { @@ -737,7 +737,7 @@ As an example, given input IR of the following: This pass would produce the following IR: -.. code-block:: llvm +.. code-block:: text define void @test() gc "statepoint-example" { %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0) diff --git a/docs/TableGen/LangIntro.rst b/docs/TableGen/LangIntro.rst index a148634..c1391e7 100644 --- a/docs/TableGen/LangIntro.rst +++ b/docs/TableGen/LangIntro.rst @@ -232,7 +232,7 @@ the record ends with a semicolon. Here is a simple TableGen file: -.. code-block:: llvm +.. code-block:: text class C { bit V = 1; } def X : C; @@ -276,7 +276,7 @@ derived class or definition wants to override. Let expressions consist of the value. For example, a new class could be added to the example above, redefining the ``V`` field for all of its subclasses: -.. code-block:: llvm +.. code-block:: text class D : C { let V = 0; } def Z : D; @@ -295,7 +295,7 @@ concrete classes. Parameterized TableGen classes specify a list of variable bindings (which may optionally have defaults) that are bound when used. Here is a simple example: -.. code-block:: llvm +.. code-block:: text class FPFormat val> { bits<3> Value = val; @@ -316,7 +316,7 @@ integer. The more esoteric forms of `TableGen expressions`_ are useful in conjunction with template arguments. As an example: -.. code-block:: llvm +.. 
code-block:: text class ModRefVal val> { bits<2> Value = val; @@ -346,7 +346,7 @@ be used to decouple the interface provided to the user of the class from the actual internal data representation expected by the class. In this case, running ``llvm-tblgen`` on the example prints the following definitions: -.. code-block:: llvm +.. code-block:: text def bork { // Value bit isMod = 1; @@ -379,7 +379,7 @@ commonality exists, then in a separate place indicate what all the ops are. Here is an example TableGen fragment that shows this idea: -.. code-block:: llvm +.. code-block:: text def ops; def GPR; @@ -405,7 +405,7 @@ inherit from multiple multiclasses, instantiating definitions from each multiclass. Using a multiclass this way is exactly equivalent to instantiating the classes multiple times yourself, e.g. by writing: -.. code-block:: llvm +.. code-block:: text def ops; def GPR; @@ -432,7 +432,7 @@ the classes multiple times yourself, e.g. by writing: A ``defm`` can also be used inside a multiclass providing several levels of multiclass instantiations. -.. code-block:: llvm +.. code-block:: text class Instruction opc, string Name> { bits<4> opcode = opc; @@ -473,7 +473,7 @@ multiclass instantiations. the class list must start after the last multiclass, and there must be at least one multiclass before them. -.. code-block:: llvm +.. code-block:: text class XD { bits<4> Prefix = 11; } class XS { bits<4> Prefix = 12; } @@ -516,7 +516,7 @@ specified file in place of the include directive. The filename should be specified as a double quoted string immediately after the '``include``' keyword. Example: -.. code-block:: llvm +.. code-block:: text include "foo.td" @@ -532,7 +532,7 @@ commonality from the records. File-scope "let" expressions take a comma-separated list of bindings to apply, and one or more records to bind the values in. Here are some examples: -.. code-block:: llvm +.. 
code-block:: text let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in def RET : I<0xC3, RawFrm, (outs), (ins), "ret", [(X86retflag 0)]>; @@ -559,7 +559,7 @@ ways to factor out commonality from the records, specially if using several levels of multiclass instantiations. This also avoids the need of using "let" expressions within subsequent records inside a multiclass. -.. code-block:: llvm +.. code-block:: text multiclass basic_r opc> { let Predicates = [HasSSE2] in { @@ -587,7 +587,7 @@ TableGen supports the '``foreach``' block, which textually replicates the loop body, substituting iterator values for iterator references in the body. Example: -.. code-block:: llvm +.. code-block:: text foreach i = [0, 1, 2, 3] in { def R#i : Register<...>; @@ -598,7 +598,7 @@ This will create objects ``R0``, ``R1``, ``R2`` and ``R3``. ``foreach`` blocks may be nested. If there is only one item in the body the braces may be elided: -.. code-block:: llvm +.. code-block:: text foreach i = [0, 1, 2, 3] in def R#i : Register<...>; diff --git a/docs/TableGen/index.rst b/docs/TableGen/index.rst index 9526240..5ba555a 100644 --- a/docs/TableGen/index.rst +++ b/docs/TableGen/index.rst @@ -90,7 +90,7 @@ of the classes, then all of the definitions. This is a good way to see what the various definitions expand to fully. Running this on the ``X86.td`` file prints this (at the time of this writing): -.. code-block:: llvm +.. code-block:: text ... def ADD32rr { // Instruction X86Inst I @@ -155,7 +155,7 @@ by the code generator, and specifying it all manually would be unmaintainable, prone to bugs, and tiring to do in the first place. Because we are using TableGen, all of the information was derived from the following definition: -.. code-block:: llvm +.. code-block:: text let Defs = [EFLAGS], isCommutable = 1, // X = ADD Y,Z --> X = ADD Z,Y @@ -201,7 +201,7 @@ TableGen. **TableGen definitions** are the concrete form of 'records'. 
These generally do not have any undefined values, and are marked with the '``def``' keyword. -.. code-block:: llvm +.. code-block:: text def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP">; @@ -220,7 +220,7 @@ floating point instructions in the X86 backend). TableGen keeps track of all of the classes that are used to build up a definition, so the backend can find all definitions of a particular class, such as "Instruction". -.. code-block:: llvm +.. code-block:: text class ProcNoItin Features> : Processor; @@ -235,7 +235,7 @@ If a multiclass inherits from another multiclass, the definitions in the sub-multiclass become part of the current multiclass, as if they were declared in the current multiclass. -.. code-block:: llvm +.. code-block:: text multiclass ro_signed_pats { diff --git a/docs/WritingAnLLVMBackend.rst b/docs/WritingAnLLVMBackend.rst index 023f6ff..f0f3ab5 100644 --- a/docs/WritingAnLLVMBackend.rst +++ b/docs/WritingAnLLVMBackend.rst @@ -345,7 +345,7 @@ to define an object for each register. The specified string ``n`` becomes the ``Name`` of the register. The basic ``Register`` object does not have any subregisters and does not specify any aliases. -.. code-block:: llvm +.. code-block:: text class Register { string Namespace = ""; @@ -361,7 +361,7 @@ subregisters and does not specify any aliases. For example, in the ``X86RegisterInfo.td`` file, there are register definitions that utilize the ``Register`` class, such as: -.. code-block:: llvm +.. code-block:: text def AL : Register<"AL">, DwarfRegNum<[0, 0, 0]>; @@ -414,7 +414,7 @@ classes. In ``Target.td``, the ``Register`` class is the base for the ``RegisterWithSubRegs`` class that is used to define registers that need to specify subregisters in the ``SubRegs`` list, as shown here: -.. code-block:: llvm +.. code-block:: text class RegisterWithSubRegs subregs> : Register { let SubRegs = subregs; @@ -427,7 +427,7 @@ feature common to these subclasses. 
Note the use of "``let``" expressions to override values that are initially defined in a superclass (such as ``SubRegs`` field in the ``Rd`` class). -.. code-block:: llvm +.. code-block:: text class SparcReg : Register { field bits<5> Num; @@ -452,7 +452,7 @@ field in the ``Rd`` class). In the ``SparcRegisterInfo.td`` file, there are register definitions that utilize these subclasses of ``Register``, such as: -.. code-block:: llvm +.. code-block:: text def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>; def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>; @@ -478,7 +478,7 @@ default allocation order of the registers. A target description file ``XXXRegisterInfo.td`` that uses ``Target.td`` can construct register classes using the following class: -.. code-block:: llvm +.. code-block:: text class RegisterClass regTypes, int alignment, dag regList> { @@ -532,7 +532,7 @@ defines a group of 32 single-precision floating-point registers (``F0`` to ``F31``); ``DFPRegs`` defines a group of 16 double-precision registers (``D0-D15``). -.. code-block:: llvm +.. code-block:: text // F0, F1, F2, ..., F31 def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>; @@ -703,7 +703,7 @@ which describes one instruction. An instruction descriptor defines: The Instruction class (defined in ``Target.td``) is mostly used as a base for more complex instruction classes. -.. code-block:: llvm +.. code-block:: text class Instruction { string Namespace = ""; @@ -760,7 +760,7 @@ specific operation value for ``LD``/Load Word. The third parameter is the output destination, which is a register operand and defined in the ``Register`` target description file (``IntRegs``). -.. code-block:: llvm +.. code-block:: text def LDrr : F3_1 <3, 0b000000, (outs IntRegs:$dst), (ins MEMrr:$addr), "ld [$addr], $dst", @@ -769,7 +769,7 @@ target description file (``IntRegs``). The fourth parameter is the input source, which uses the address operand ``MEMrr`` that is defined earlier in ``SparcInstrInfo.td``: -.. 
code-block:: llvm +.. code-block:: text def MEMrr : Operand { let PrintMethod = "printMemOperand"; @@ -788,7 +788,7 @@ immediate value operands. For example, to perform a Load Integer instruction for a Word from an immediate operand to a register, the following instruction class is defined: -.. code-block:: llvm +.. code-block:: text def LDri : F3_2 <3, 0b000000, (outs IntRegs:$dst), (ins MEMri:$addr), "ld [$addr], $dst", @@ -801,7 +801,7 @@ creation of templates to define several instruction classes at once (using the pattern ``F3_12`` is defined to create 2 instruction classes each time ``F3_12`` is invoked: -.. code-block:: llvm +.. code-block:: text multiclass F3_12 Op3Val, SDNode OpNode> { def rr : F3_1 <2, Op3Val, @@ -818,7 +818,7 @@ So when the ``defm`` directive is used for the ``XOR`` and ``ADD`` instructions, as seen below, it creates four instruction objects: ``XORrr``, ``XORri``, ``ADDrr``, and ``ADDri``. -.. code-block:: llvm +.. code-block:: text defm XOR : F3_12<"xor", 0b000011, xor>; defm ADD : F3_12<"add", 0b000000, add>; @@ -830,7 +830,7 @@ For example, the 10\ :sup:`th` bit represents the "greater than" condition for integers, and the 22\ :sup:`nd` bit represents the "greater than" condition for floats. -.. code-block:: llvm +.. code-block:: text def ICC_NE : ICC_VAL< 9>; // Not Equal def ICC_E : ICC_VAL< 1>; // Equal @@ -855,7 +855,7 @@ order they are defined. Fields are bound when they are assigned a value. For example, the Sparc target defines the ``XNORrr`` instruction as a ``F3_1`` format instruction having three operands. -.. code-block:: llvm +.. code-block:: text def XNORrr : F3_1<2, 0b000111, (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c), @@ -865,7 +865,7 @@ format instruction having three operands. The instruction templates in ``SparcInstrFormats.td`` show the base class for ``F3_1`` is ``InstSP``. -.. code-block:: llvm +.. 
code-block:: text class InstSP pattern> : Instruction { field bits<32> Inst; @@ -880,7 +880,7 @@ The instruction templates in ``SparcInstrFormats.td`` show the base class for ``InstSP`` leaves the ``op`` field unbound. -.. code-block:: llvm +.. code-block:: text class F3 pattern> : InstSP { @@ -897,7 +897,7 @@ The instruction templates in ``SparcInstrFormats.td`` show the base class for fields. ``F3`` format instructions will bind the operands ``rd``, ``op3``, and ``rs1`` fields. -.. code-block:: llvm +.. code-block:: text class F3_1 opVal, bits<6> op3val, dag outs, dag ins, string asmstr, list pattern> : F3 { @@ -925,7 +925,7 @@ TableGen definition will add all of its operands to an enumeration in the llvm::XXX:OpName namespace and also add an entry for it into the OperandMap table, which can be queried using getNamedOperandIdx() -.. code-block:: llvm +.. code-block:: text int DstIndex = SP::getNamedOperandIdx(SP::XNORrr, SP::OpName::dst); // => 0 int BIndex = SP::getNamedOperandIdx(SP::XNORrr, SP::OpName::b); // => 1 @@ -972,7 +972,7 @@ For example, the X86 backend defines ``brtarget`` and ``brtarget8``, both instances of the TableGen ``Operand`` class, which represent branch target operands: -.. code-block:: llvm +.. code-block:: text def brtarget : Operand; def brtarget8 : Operand; @@ -1222,14 +1222,14 @@ definitions in ``XXXInstrInfo.td``. For example, in ``SparcInstrInfo.td``, this entry defines a register store operation, and the last parameter describes a pattern with the store DAG operator. -.. code-block:: llvm +.. code-block:: text def STrr : F3_1< 3, 0b000100, (outs), (ins MEMrr:$addr, IntRegs:$src), "st $src, [$addr]", [(store i32:$src, ADDRrr:$addr)]>; ``ADDRrr`` is a memory mode that is also defined in ``SparcInstrInfo.td``: -.. code-block:: llvm +.. 
code-block:: text def ADDRrr : ComplexPattern; @@ -1240,7 +1240,7 @@ defined in an implementation of the Instructor Selector (such as In ``lib/Target/TargetSelectionDAG.td``, the DAG operator for store is defined below: -.. code-block:: llvm +.. code-block:: text def store : PatFrag<(ops node:$val, node:$ptr), (st node:$val, node:$ptr), [{ @@ -1458,7 +1458,7 @@ if the current argument is of type ``f32`` or ``f64``), then the action is performed. In this case, the ``CCAssignToReg`` action assigns the argument value to the first available register: either ``R0`` or ``R1``. -.. code-block:: llvm +.. code-block:: text CCIfType<[f32,f64], CCAssignToReg<[R0, R1]>> @@ -1469,7 +1469,7 @@ which registers are used for specified scalar return types. A single-precision float is returned to register ``F0``, and a double-precision float goes to register ``D0``. A 32-bit integer is returned in register ``I0`` or ``I1``. -.. code-block:: llvm +.. code-block:: text def RetCC_Sparc32 : CallingConv<[ CCIfType<[i32], CCAssignToReg<[I0, I1]>>, @@ -1484,7 +1484,7 @@ the size of the slot, and the second parameter, also 4, indicates the stack alignment along 4-byte units. (Special cases: if size is zero, then the ABI size is used; if alignment is zero, then the ABI alignment is used.) -.. code-block:: llvm +.. code-block:: text def CC_Sparc32 : CallingConv<[ // All arguments get passed in integer registers if there is space. @@ -1499,7 +1499,7 @@ the following example (in ``X86CallingConv.td``), the definition of assigned to the register ``ST0`` or ``ST1``, the ``RetCC_X86Common`` is invoked. -.. code-block:: llvm +.. code-block:: text def RetCC_X86_32_C : CallingConv<[ CCIfType<[f32], CCAssignToReg<[ST0, ST1]>>, @@ -1514,7 +1514,7 @@ then a specified action is invoked. In the following example (in ``RetCC_X86_32_Fast`` is invoked. If the ``SSECall`` calling convention is in use, then ``RetCC_X86_32_SSE`` is invoked. -.. code-block:: llvm +.. 
code-block:: text def RetCC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::Fast", CCDelegateTo>, @@ -1682,7 +1682,7 @@ feature, the value of the attribute, and a description of the feature. (The fifth parameter is a list of features whose presence is implied, and its default value is an empty array.) -.. code-block:: llvm +.. code-block:: text class SubtargetFeature i = []> { @@ -1696,7 +1696,7 @@ default value is an empty array.) In the ``Sparc.td`` file, the ``SubtargetFeature`` is used to define the following features. -.. code-block:: llvm +.. code-block:: text def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true", "Enable SPARC-V9 instructions">; @@ -1710,7 +1710,7 @@ Elsewhere in ``Sparc.td``, the ``Proc`` class is defined and then is used to define particular SPARC processor subtypes that may have the previously described features. -.. code-block:: llvm +.. code-block:: text class Proc Features> : Processor; diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst index 9e9d9f1..537bbbc 100644 --- a/docs/WritingAnLLVMPass.rst +++ b/docs/WritingAnLLVMPass.rst @@ -747,7 +747,7 @@ template parameter is the name of the pass that is to be used on the command line to specify that the pass should be added to a program (for example, with :program:`opt` or :program:`bugpoint`). The first argument is the name of the pass, which is to be used for the :option:`-help` output of programs, as well -as for debug output generated by the :option:`--debug-pass` option. +as for debug output generated by the `--debug-pass` option. If you want your pass to be easily dumpable, you should implement the virtual print method: diff --git a/docs/index.rst b/docs/index.rst index ef1d4ec..a68dd1b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,11 +1,6 @@ Overview ======== -.. warning:: - - If you are using a released version of LLVM, see `the download page - `_ to find your documentation. 
- The LLVM compiler infrastructure supports a wide range of projects, from industrial strength compilers to specialized JIT applications to small research projects. diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h index 6bdb96a..76f8b31 100644 --- a/include/llvm-c/Core.h +++ b/include/llvm-c/Core.h @@ -2014,6 +2014,9 @@ void LLVMAddFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA); void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, LLVMAttributeRef A); +unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx); +void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs); LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unsigned KindID); @@ -2600,6 +2603,9 @@ void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, LLVMAttributeRef A); +unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C, LLVMAttributeIndex Idx); +void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs); LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, unsigned KindID); diff --git a/include/llvm/ADT/GraphTraits.h b/include/llvm/ADT/GraphTraits.h index 823caef..eb67b7c 100644 --- a/include/llvm/ADT/GraphTraits.h +++ b/include/llvm/ADT/GraphTraits.h @@ -27,19 +27,24 @@ template struct GraphTraits { // Elements to provide: + // NOTICE: We are in a transition from migration interfaces that require + // NodeType *, to NodeRef. NodeRef is required to be cheap to copy, but does + // not have to be a raw pointer. In the transition, user should define + // NodeType, and NodeRef = NodeType *. 
+ // // typedef NodeType - Type of Node in the graph + // typedef NodeRef - NodeType * // typedef ChildIteratorType - Type used to iterate over children in graph - // static NodeType *getEntryNode(const GraphType &) + // static NodeRef getEntryNode(const GraphType &) // Return the entry node of the graph - // static ChildIteratorType child_begin(NodeType *) - // static ChildIteratorType child_end (NodeType *) + // static ChildIteratorType child_begin(NodeRef) + // static ChildIteratorType child_end (NodeRef) // Return iterators that point to the beginning and ending of the child // node list for the specified node. // - // typedef ...iterator nodes_iterator; // static nodes_iterator nodes_begin(GraphType *G) // static nodes_iterator nodes_end (GraphType *G) @@ -57,7 +62,7 @@ struct GraphTraits { // your argument to XXX_begin(...) is unknown or needs to have the proper .h // file #include'd. // - typedef typename GraphType::UnknownGraphTypeError NodeType; + typedef typename GraphType::UnknownGraphTypeError NodeRef; }; diff --git a/include/llvm/ADT/SCCIterator.h b/include/llvm/ADT/SCCIterator.h index bc74416..e89345c 100644 --- a/include/llvm/ADT/SCCIterator.h +++ b/include/llvm/ADT/SCCIterator.h @@ -37,23 +37,22 @@ namespace llvm { /// build up a vector of nodes in a particular SCC. Note that it is a forward /// iterator and thus you cannot backtrack or re-visit nodes. template > -class scc_iterator - : public iterator_facade_base< - scc_iterator, std::forward_iterator_tag, - const std::vector, ptrdiff_t> { - typedef typename GT::NodeType NodeType; +class scc_iterator : public iterator_facade_base< + scc_iterator, std::forward_iterator_tag, + const std::vector, ptrdiff_t> { + typedef typename GT::NodeRef NodeRef; typedef typename GT::ChildIteratorType ChildItTy; - typedef std::vector SccTy; + typedef std::vector SccTy; typedef typename scc_iterator::reference reference; /// Element of VisitStack during DFS. 
struct StackElement { - NodeType *Node; ///< The current node pointer. + NodeRef Node; ///< The current node pointer. ChildItTy NextChild; ///< The next child, modified inplace during DFS. unsigned MinVisited; ///< Minimum uplink value of all children of Node. - StackElement(NodeType *Node, const ChildItTy &Child, unsigned Min) - : Node(Node), NextChild(Child), MinVisited(Min) {} + StackElement(NodeRef Node, const ChildItTy &Child, unsigned Min) + : Node(Node), NextChild(Child), MinVisited(Min) {} bool operator==(const StackElement &Other) const { return Node == Other.Node && @@ -67,10 +66,10 @@ class scc_iterator /// /// nodeVisitNumbers are per-node visit numbers, also used as DFS flags. unsigned visitNum; - DenseMap nodeVisitNumbers; + DenseMap nodeVisitNumbers; /// Stack holding nodes of the SCC. - std::vector SCCNodeStack; + std::vector SCCNodeStack; /// The current SCC, retrieved using operator*(). SccTy CurrentSCC; @@ -80,7 +79,7 @@ class scc_iterator std::vector VisitStack; /// A single "visit" within the non-recursive DFS traversal. - void DFSVisitOne(NodeType *N); + void DFSVisitOne(NodeRef N); /// The stack-based DFS traversal; defined below. void DFSVisitChildren(); @@ -88,7 +87,7 @@ class scc_iterator /// Compute the next SCC using the DFS traversal. void GetNextSCC(); - scc_iterator(NodeType *entryN) : visitNum(0) { + scc_iterator(NodeRef entryN) : visitNum(0) { DFSVisitOne(entryN); GetNextSCC(); } @@ -131,7 +130,7 @@ public: /// This informs the \c scc_iterator that the specified \c Old node /// has been deleted, and \c New is to be used in its place. 
- void ReplaceNode(NodeType *Old, NodeType *New) { + void ReplaceNode(NodeRef Old, NodeRef New) { assert(nodeVisitNumbers.count(Old) && "Old not in scc_iterator?"); nodeVisitNumbers[New] = nodeVisitNumbers[Old]; nodeVisitNumbers.erase(Old); @@ -139,7 +138,7 @@ public: }; template -void scc_iterator::DFSVisitOne(NodeType *N) { +void scc_iterator::DFSVisitOne(NodeRef N) { ++visitNum; nodeVisitNumbers[N] = visitNum; SCCNodeStack.push_back(N); @@ -155,8 +154,8 @@ void scc_iterator::DFSVisitChildren() { assert(!VisitStack.empty()); while (VisitStack.back().NextChild != GT::child_end(VisitStack.back().Node)) { // TOS has at least one more child so continue DFS - NodeType *childN = *VisitStack.back().NextChild++; - typename DenseMap::iterator Visited = + NodeRef childN = *VisitStack.back().NextChild++; + typename DenseMap::iterator Visited = nodeVisitNumbers.find(childN); if (Visited == nodeVisitNumbers.end()) { // this node has never been seen. @@ -176,7 +175,7 @@ template void scc_iterator::GetNextSCC() { DFSVisitChildren(); // Pop the leaf on top of the VisitStack. 
- NodeType *visitingN = VisitStack.back().Node; + NodeRef visitingN = VisitStack.back().Node; unsigned minVisitNum = VisitStack.back().MinVisited; assert(VisitStack.back().NextChild == GT::child_end(visitingN)); VisitStack.pop_back(); @@ -212,7 +211,7 @@ bool scc_iterator::hasLoop() const { assert(!CurrentSCC.empty() && "Dereferencing END SCC iterator!"); if (CurrentSCC.size() > 1) return true; - NodeType *N = CurrentSCC.front(); + NodeRef N = CurrentSCC.front(); for (ChildItTy CI = GT::child_begin(N), CE = GT::child_end(N); CI != CE; ++CI) if (*CI == N) diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index abd39da..00b796f 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -26,10 +26,18 @@ #include #include // for std::pair +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Compiler.h" namespace llvm { +namespace detail { + +template +using IterOfRange = decltype(std::begin(std::declval())); + +} // End detail namespace //===----------------------------------------------------------------------===// // Extra additions to @@ -235,6 +243,90 @@ auto reverse( llvm::make_reverse_iterator(std::begin(C))); } +/// An iterator adaptor that filters the elements of given inner iterators. +/// +/// The predicate parameter should be a callable object that accepts the wrapped +/// iterator's reference type and returns a bool. When incrementing or +/// decrementing the iterator, it will call the predicate on each element and +/// skip any where it returns false. +/// +/// \code +/// int A[] = { 1, 2, 3, 4 }; +/// auto R = make_filter_range(A, [](int N) { return N % 2 == 1; }); +/// // R contains { 1, 3 }. 
+/// \endcode +template +class filter_iterator + : public iterator_adaptor_base< + filter_iterator, WrappedIteratorT, + typename std::common_type< + std::forward_iterator_tag, + typename std::iterator_traits< + WrappedIteratorT>::iterator_category>::type> { + using BaseT = iterator_adaptor_base< + filter_iterator, WrappedIteratorT, + typename std::common_type< + std::forward_iterator_tag, + typename std::iterator_traits::iterator_category>:: + type>; + + struct PayloadType { + WrappedIteratorT End; + PredicateT Pred; + }; + + Optional Payload; + + void findNextValid() { + assert(Payload && "Payload should be engaged when findNextValid is called"); + while (this->I != Payload->End && !Payload->Pred(*this->I)) + BaseT::operator++(); + } + + // Construct the begin iterator. The begin iterator requires to know where end + // is, so that it can properly stop when it hits end. + filter_iterator(WrappedIteratorT Begin, WrappedIteratorT End, PredicateT Pred) + : BaseT(std::move(Begin)), + Payload(PayloadType{std::move(End), std::move(Pred)}) { + findNextValid(); + } + + // Construct the end iterator. It's not incrementable, so Payload doesn't + // have to be engaged. + filter_iterator(WrappedIteratorT End) : BaseT(End) {} + +public: + using BaseT::operator++; + + filter_iterator &operator++() { + BaseT::operator++(); + findNextValid(); + return *this; + } + + template + friend iterator_range, PT>> + make_filter_range(RT &&, PT); +}; + +/// Convenience function that takes a range of elements and a predicate, +/// and return a new filter_iterator range. +/// +/// FIXME: Currently if RangeT && is a rvalue reference to a temporary, the +/// lifetime of that temporary is not kept by the returned range object, and the +/// temporary is going to be dropped on the floor after the make_iterator_range +/// full expression that contains this function call. 
+template +iterator_range, PredicateT>> +make_filter_range(RangeT &&Range, PredicateT Pred) { + using FilterIteratorT = + filter_iterator, PredicateT>; + return make_range(FilterIteratorT(std::begin(std::forward(Range)), + std::end(std::forward(Range)), + std::move(Pred)), + FilterIteratorT(std::end(std::forward(Range)))); +} + //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index 4781304..b98f840 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -174,6 +174,7 @@ public: UnknownEnvironment, GNU, + GNUABI64, GNUEABI, GNUEABIHF, GNUX32, @@ -476,8 +477,9 @@ public: bool isGNUEnvironment() const { EnvironmentType Env = getEnvironment(); - return Env == Triple::GNU || Env == Triple::GNUEABI || - Env == Triple::GNUEABIHF || Env == Triple::GNUX32; + return Env == Triple::GNU || Env == Triple::GNUABI64 || + Env == Triple::GNUEABI || Env == Triple::GNUEABIHF || + Env == Triple::GNUX32; } /// Checks if the environment could be MSVC. diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h index 2898a67..0bd28d5 100644 --- a/include/llvm/ADT/iterator.h +++ b/include/llvm/ADT/iterator.h @@ -155,7 +155,14 @@ template < typename T = typename std::iterator_traits::value_type, typename DifferenceTypeT = typename std::iterator_traits::difference_type, - typename PointerT = T *, typename ReferenceT = T &, + typename PointerT = typename std::conditional< + std::is_same::value_type>::value, + typename std::iterator_traits::pointer, T *>::type, + typename ReferenceT = typename std::conditional< + std::is_same::value_type>::value, + typename std::iterator_traits::reference, T &>::type, // Don't provide these, they are mostly to act as aliases below. 
typename WrappedTraitsT = std::iterator_traits> class iterator_adaptor_base @@ -168,15 +175,7 @@ protected: iterator_adaptor_base() = default; - template - explicit iterator_adaptor_base( - U &&u, - typename std::enable_if< - !std::is_base_of::type>::type, - DerivedT>::value, - int>::type = 0) - : I(std::forward(u)) {} + explicit iterator_adaptor_base(WrappedIteratorT u) : I(std::move(u)) {} const WrappedIteratorT &wrapped() const { return I; } diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h index 4ecacb0..f37e843 100644 --- a/include/llvm/Analysis/CallGraph.h +++ b/include/llvm/Analysis/CallGraph.h @@ -410,6 +410,7 @@ public: // traversals. template <> struct GraphTraits { typedef CallGraphNode NodeType; + typedef CallGraphNode *NodeRef; typedef CallGraphNode::CallRecord CGNPairTy; typedef std::pointer_to_unary_function @@ -431,6 +432,7 @@ template <> struct GraphTraits { template <> struct GraphTraits { typedef const CallGraphNode NodeType; + typedef const CallGraphNode *NodeRef; typedef CallGraphNode::CallRecord CGNPairTy; typedef std::pointer_to_unary_function diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h index 2fa856a..1acf952 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpander.h +++ b/include/llvm/Analysis/ScalarEvolutionExpander.h @@ -196,6 +196,13 @@ namespace llvm { /// block. Value *expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I); + /// \brief Insert code to directly compute the specified SCEV expression + /// into the program. The inserted code is inserted into the SCEVExpander's + /// current insertion point. If a type is specified, the result will be + /// expanded to have that type, with a cast if necessary. + Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr); + + /// \brief Generates a code sequence that evaluates this predicate. /// The inserted instructions will be at position \p Loc. 
/// The result will be of type i1 and will have a value of 0 when the @@ -253,6 +260,15 @@ namespace llvm { void enableLSRMode() { LSRMode = true; } + /// \brief Set the current insertion point. This is useful if multiple calls + /// to expandCodeFor() are going to be made with the same insert point and + /// the insert point may be moved during one of the expansions (e.g. if the + /// insert point is not a block terminator). + void setInsertPoint(Instruction *IP) { + assert(IP); + Builder.SetInsertPoint(IP); + } + /// \brief Clear the current insertion point. This is useful if the /// instruction that had been serving as the insertion point may have been /// deleted. @@ -313,12 +329,6 @@ namespace llvm { Value *expand(const SCEV *S); - /// \brief Insert code to directly compute the specified SCEV expression - /// into the program. The inserted code is inserted into the SCEVExpander's - /// current insertion point. If a type is specified, the result will be - /// expanded to have that type, with a cast if necessary. - Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr); - /// \brief Determine the most "relevant" loop for the given SCEV. 
const Loop *getRelevantLoop(const SCEV *); diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index d5f918e..2923371 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -740,6 +740,7 @@ struct MBB2NumberFunctor : template <> struct GraphTraits { typedef MachineBasicBlock NodeType; + typedef MachineBasicBlock *NodeRef; typedef MachineBasicBlock::succ_iterator ChildIteratorType; static NodeType *getEntryNode(MachineBasicBlock *BB) { return BB; } @@ -753,6 +754,7 @@ template <> struct GraphTraits { template <> struct GraphTraits { typedef const MachineBasicBlock NodeType; + typedef const MachineBasicBlock *NodeRef; typedef MachineBasicBlock::const_succ_iterator ChildIteratorType; static NodeType *getEntryNode(const MachineBasicBlock *BB) { return BB; } @@ -772,6 +774,7 @@ template <> struct GraphTraits { // template <> struct GraphTraits > { typedef MachineBasicBlock NodeType; + typedef MachineBasicBlock *NodeRef; typedef MachineBasicBlock::pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; @@ -786,6 +789,7 @@ template <> struct GraphTraits > { template <> struct GraphTraits > { typedef const MachineBasicBlock NodeType; + typedef const MachineBasicBlock *NodeRef; typedef MachineBasicBlock::const_pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index af1bf0a..5ef0371 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -210,6 +210,7 @@ public: private: friend class AttrBuilder; friend class AttributeSetImpl; + friend class AttributeSetNode; template friend struct DenseMapInfo; /// \brief The attributes that we are managing. 
This can be null to represent diff --git a/include/llvm/IR/CFG.h b/include/llvm/IR/CFG.h index e9bf093..a256b59 100644 --- a/include/llvm/IR/CFG.h +++ b/include/llvm/IR/CFG.h @@ -155,6 +155,7 @@ struct isPodLike> { template <> struct GraphTraits { typedef BasicBlock NodeType; + typedef BasicBlock *NodeRef; typedef succ_iterator ChildIteratorType; static NodeType *getEntryNode(BasicBlock *BB) { return BB; } @@ -168,6 +169,7 @@ template <> struct GraphTraits { template <> struct GraphTraits { typedef const BasicBlock NodeType; + typedef const BasicBlock *NodeRef; typedef succ_const_iterator ChildIteratorType; static NodeType *getEntryNode(const BasicBlock *BB) { return BB; } @@ -187,6 +189,7 @@ template <> struct GraphTraits { // template <> struct GraphTraits > { typedef BasicBlock NodeType; + typedef BasicBlock *NodeRef; typedef pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; } static inline ChildIteratorType child_begin(NodeType *N) { @@ -199,6 +202,7 @@ template <> struct GraphTraits > { template <> struct GraphTraits > { typedef const BasicBlock NodeType; + typedef const BasicBlock *NodeRef; typedef const_pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 74c9715..b965f08 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -479,6 +479,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">, @@ -1512,8 +1514,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>; def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>; + def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>; def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>; + def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>; } // Vector bit test diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index d21d321..4586a17 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -2349,6 +2349,10 @@ public: /// from getBooleanContents(). bool isConstFalseVal(const SDNode *N) const; + /// Return a constant of type VT that contains a true value that respects + /// getBooleanContents() + SDValue getConstTrueVal(SelectionDAG &DAG, EVT VT, const SDLoc &DL) const; + /// Return if \p N is a True value when extended to \p VT. 
bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool Signed) const; diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp index 90bc249..c2039e1 100644 --- a/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -623,6 +623,7 @@ template <> struct GraphTraits { typedef bfi_detail::IrreducibleGraph GraphT; typedef const GraphT::IrrNode NodeType; + typedef const GraphT::IrrNode *NodeRef; typedef GraphT::IrrNode::iterator ChildIteratorType; static const NodeType *getEntryNode(const GraphT &G) { diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 6c471ab..c9adaa7 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -1424,8 +1424,8 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V, /// integer type Ty is used to select how many bits are available for the /// result. Returns null if the conversion cannot be performed, otherwise /// returns the Constant value resulting from the conversion. -Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero, - Type *Ty) { +Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero, + Type *Ty) { // All of these conversion intrinsics form an integer of at most 64bits. 
unsigned ResultWidth = Ty->getIntegerBitWidth(); assert(ResultWidth <= 64 && @@ -1438,7 +1438,8 @@ Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero, APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth, /*isSigned=*/true, mode, &isExact); - if (status != APFloat::opOK && status != APFloat::opInexact) + if (status != APFloat::opOK && + (!roundTowardZero || status != APFloat::opInexact)) return nullptr; return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true); } @@ -1676,17 +1677,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, case Intrinsic::x86_sse2_cvtsd2si: case Intrinsic::x86_sse2_cvtsd2si64: if (ConstantFP *FPOp = - dyn_cast_or_null(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), - /*roundTowardZero=*/false, Ty); + dyn_cast_or_null(Op->getAggregateElement(0U))) + return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), + /*roundTowardZero=*/false, Ty); case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: if (ConstantFP *FPOp = - dyn_cast_or_null(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), - /*roundTowardZero=*/true, Ty); + dyn_cast_or_null(Op->getAggregateElement(0U))) + return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), + /*roundTowardZero=*/true, Ty); } } diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 0cb2c78..aeaf938 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -3400,7 +3400,10 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, return TrueVal; if (const auto *ICI = dyn_cast(CondVal)) { - unsigned BitWidth = Q.DL.getTypeSizeInBits(TrueVal->getType()); + // FIXME: This code is nearly duplicated in InstCombine. Using/refactoring + // decomposeBitTestICmp() might help. 
+ unsigned BitWidth = + Q.DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); ICmpInst::Predicate Pred = ICI->getPredicate(); Value *CmpLHS = ICI->getOperand(0); Value *CmpRHS = ICI->getOperand(1); @@ -4274,7 +4277,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // Gracefully handle edge cases where the instruction is not wired into any // parent block. - if (I->getParent()) + if (I->getParent() && !I->isEHPad() && !isa(I) && + !I->mayHaveSideEffects()) I->eraseFromParent(); } else { Worklist.insert(I); @@ -4302,7 +4306,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // Gracefully handle edge cases where the instruction is not wired into any // parent block. - if (I->getParent()) + if (I->getParent() && !I->isEHPad() && !isa(I) && + !I->mayHaveSideEffects()) I->eraseFromParent(); } return Simplified; diff --git a/lib/Analysis/LoopUnrollAnalyzer.cpp b/lib/Analysis/LoopUnrollAnalyzer.cpp index f59257a..7bdf340 100644 --- a/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -115,13 +115,19 @@ bool UnrolledInstAnalyzer::visitLoad(LoadInst &I) { // We might have a vector load from an array. FIXME: for now we just bail // out in this case, but we should be able to resolve and simplify such // loads. - if(CDS->getElementType() != I.getType()) + if (CDS->getElementType() != I.getType()) return false; - int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; - if (SimplifiedAddrOp->getValue().getActiveBits() >= 64) + unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; + if (SimplifiedAddrOp->getValue().getActiveBits() > 64) return false; - int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize; + int64_t SimplifiedAddrOpV = SimplifiedAddrOp->getSExtValue(); + if (SimplifiedAddrOpV < 0) { + // FIXME: For now we conservatively ignore out of bound accesses, but + // we're allowed to perform the optimization in this case. 
+ return false; + } + uint64_t Index = static_cast(SimplifiedAddrOpV) / ElemSize; if (Index >= CDS->getNumElements()) { // FIXME: For now we conservatively ignore out of bound accesses, but // we're allowed to perform the optimization in this case. diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 77e4ec7..2e45bb8 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1610,8 +1610,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, Instruction *IP) { - assert(IP); - Builder.SetInsertPoint(IP); + setInsertPoint(IP); return expandCodeFor(SH, Ty); } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index b0ba571..ebf80de 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -214,10 +214,7 @@ TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) { } TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) { - // It's possible to ask for the FuncId of a function which doesn't have a - // subprogram: inlining a function with debug info into a function with none. - if (!SP) - return TypeIndex::None(); + assert(SP); // Check if we've already translated this subprogram. auto I = TypeIndices.find({SP, nullptr}); @@ -621,11 +618,12 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, std::string FuncName; auto *SP = GV->getSubprogram(); + assert(SP); setCurrentSubprogram(SP); // If we have a display name, build the fully qualified name by walking the // chain of scopes. 
- if (SP != nullptr && !SP->getDisplayName().empty()) + if (!SP->getDisplayName().empty()) FuncName = getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName()); @@ -864,7 +862,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { void CodeViewDebug::beginFunction(const MachineFunction *MF) { assert(!CurFn && "Can't process two functions at once!"); - if (!Asm || !MMI->hasDebugInfo()) + if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram()) return; DebugHandlerBase::beginFunction(MF); @@ -1939,7 +1937,8 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); // Ignore DBG_VALUE locations and function prologue. - if (!Asm || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup)) + if (!Asm || !CurFn || MI->isDebugValue() || + MI->getFlag(MachineInstr::FrameSetup)) return; DebugLoc DL = MI->getDebugLoc(); if (DL == PrevInstLoc || !DL) diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index fa70576..23e2aa7 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -996,6 +996,24 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { MachineBasicBlock *IBB = &*I; MachineBasicBlock *PredBB = &*std::prev(I); MergePotentials.clear(); + MachineLoop *ML; + + // Bail if merging after placement and IBB is the loop header because + // -- If merging predecessors that belong to the same loop as IBB, the + // common tail of merged predecessors may become the loop top if block + // placement is called again and the predecessors may branch to this common + // tail and require more branches. This can be relaxed if + // MachineBlockPlacement::findBestLoopTop is more flexible. + // --If merging predecessors that do not belong to the same loop as IBB, the + // loop info of IBB's loop and the other loops may be affected. 
Calling the + // block placement again may make big change to the layout and eliminate the + // reason to do tail merging here. + if (AfterBlockPlacement && MLI) { + ML = MLI->getLoopFor(IBB); + if (ML && IBB == ML->getHeader()) + continue; + } + for (MachineBasicBlock *PBB : I->predecessors()) { if (MergePotentials.size() == TailMergeThreshold) break; @@ -1015,16 +1033,12 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (PBB->hasEHPadSuccessor()) continue; - // Bail out if the loop header (IBB) is not the top of the loop chain - // after the block placement. Otherwise, the common tail of IBB's - // predecessors may become the loop top if block placement is called again - // and the predecessors may branch to this common tail. - // FIXME: Relaxed this check if the algorithm of finding loop top is - // changed in MBP. + // After block placement, only consider predecessors that belong to the + // same loop as IBB. The reason is the same as above when skipping loop + // header. 
if (AfterBlockPlacement && MLI) - if (MachineLoop *ML = MLI->getLoopFor(IBB)) - if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB)) - continue; + if (ML != MLI->getLoopFor(PBB)) + continue; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index 19cd59b..4a1b995 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -530,7 +530,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( unsigned Align = std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), - Align, SSC.getLiveRange(StackGuardSlot)); + Align, SSC.getFullLiveRange()); } for (Argument *Arg : ByValArguments) { diff --git a/lib/CodeGen/SafeStackColoring.cpp b/lib/CodeGen/SafeStackColoring.cpp index 709614f..795eb8d 100644 --- a/lib/CodeGen/SafeStackColoring.cpp +++ b/lib/CodeGen/SafeStackColoring.cpp @@ -25,7 +25,9 @@ static cl::opt ClColoring("safe-stack-coloring", cl::Hidden, cl::init(true)); const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) { - return LiveRanges[AllocaNumbering[AI]]; + const auto IT = AllocaNumbering.find(AI); + assert(IT != AllocaNumbering.end()); + return LiveRanges[IT->second]; } bool StackColoring::readMarker(Instruction *I, bool *IsStart) { diff --git a/lib/CodeGen/SafeStackLayout.cpp b/lib/CodeGen/SafeStackLayout.cpp index b8190e0..fb433c1 100644 --- a/lib/CodeGen/SafeStackLayout.cpp +++ b/lib/CodeGen/SafeStackLayout.cpp @@ -100,7 +100,8 @@ void StackLayout::layoutObject(StackObject &Obj) { } // Split starting and ending regions if necessary. 
- for (StackRegion &R : Regions) { + for (unsigned i = 0; i < Regions.size(); ++i) { + StackRegion &R = Regions[i]; if (Start > R.Start && Start < R.End) { StackRegion R0 = R; R.Start = R0.End = Start; diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d888676..5ecc6da 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6198,13 +6198,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } } - // sext(setcc x, y, cc) -> (select (setcc x, y, cc), -1, 0) - unsigned ElementWidth = VT.getScalarType().getSizeInBits(); + // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) + // Here, T can be 1 or -1, depending on the type of the setcc and + // getBooleanContents(). + unsigned SetCCWidth = N0.getValueType().getScalarSizeInBits(); + SDLoc DL(N); - SDValue NegOne = - DAG.getConstant(APInt::getAllOnesValue(ElementWidth), DL, VT); + // To determine the "true" side of the select, we need to know the high bit + // of the value returned by the setcc if it evaluates to true. + // If the type of the setcc is i1, then the true case of the select is just + // sext(i1 1), that is, -1. + // If the type of the setcc is larger (say, i8) then the value of the high + // bit depends on getBooleanContents(). So, ask TLI for a real "true" value + // of the appropriate width. + SDValue ExtTrueVal = + (SetCCWidth == 1) + ? 
DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), + DL, VT) + : TLI.getConstTrueVal(DAG, VT, DL); + if (SDValue SCC = SimplifySelectCC( - DL, N0.getOperand(0), N0.getOperand(1), NegOne, + DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal, DAG.getConstant(0, DL, VT), cast(N0.getOperand(2))->get(), true)) return SCC; @@ -6215,10 +6229,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { SDLoc DL(N); ISD::CondCode CC = cast(N0.getOperand(2))->get(); - SDValue SetCC = DAG.getSetCC(DL, SetCCVT, - N0.getOperand(0), N0.getOperand(1), CC); - return DAG.getSelect(DL, VT, SetCC, - NegOne, DAG.getConstant(0, DL, VT)); + SDValue SetCC = + DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); + return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, + DAG.getConstant(0, DL, VT)); } } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8235522..29d11c7 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6639,19 +6639,26 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { SDNode *FromNode = From.getNode(); SDNode *ToNode = To.getNode(); ArrayRef DVs = GetDbgValues(FromNode); + SmallVector ClonedDVs; for (ArrayRef::iterator I = DVs.begin(), E = DVs.end(); I != E; ++I) { SDDbgValue *Dbg = *I; // Only add Dbgvalues attached to same ResNo. 
if (Dbg->getKind() == SDDbgValue::SDNODE && - Dbg->getResNo() == From.getResNo()) { + Dbg->getSDNode() == From.getNode() && + Dbg->getResNo() == From.getResNo() && !Dbg->isInvalidated()) { + assert(FromNode != ToNode && + "Should not transfer Debug Values intranode"); SDDbgValue *Clone = getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode, To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(), Dbg->getDebugLoc(), Dbg->getOrder()); - AddDbgValue(Clone, ToNode, false); + ClonedDVs.push_back(Clone); + Dbg->setIsInvalidated(); } } + for (SDDbgValue *I : ClonedDVs) + AddDbgValue(I, ToNode, false); } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f2bc88a..806646f 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1234,6 +1234,16 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const { llvm_unreachable("Invalid boolean contents"); } +SDValue TargetLowering::getConstTrueVal(SelectionDAG &DAG, EVT VT, + const SDLoc &DL) const { + unsigned ElementWidth = VT.getScalarSizeInBits(); + APInt TrueInt = + getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent + ? 
APInt(ElementWidth, 1) + : APInt::getAllOnesValue(ElementWidth); + return DAG.getConstant(TrueInt, DL, VT); +} + bool TargetLowering::isConstFalseVal(const SDNode *N) const { if (!N) return false; diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 3d9a518..8feb18b 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -29,7 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" @@ -539,6 +539,16 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { return TRI->regsOverlap(RegA, RegB); } +// Returns true if Reg is equal or aliased to at least one register in Set. +static bool regOverlapsSet(const SmallVectorImpl &Set, unsigned Reg, + const TargetRegisterInfo *TRI) { + for (unsigned R : Set) + if (TRI->regsOverlap(R, Reg)) + return true; + + return false; +} + /// Return true if it's potentially profitable to commute the two-address /// instruction that's being processed. bool @@ -864,9 +874,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, // FIXME: Needs more sophisticated heuristics. 
return false; - SmallSet Uses; - SmallSet Kills; - SmallSet Defs; + SmallVector Uses; + SmallVector Kills; + SmallVector Defs; for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; @@ -874,12 +884,12 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, if (!MOReg) continue; if (MO.isDef()) - Defs.insert(MOReg); + Defs.push_back(MOReg); else { - Uses.insert(MOReg); + Uses.push_back(MOReg); if (MOReg != Reg && (MO.isKill() || (LIS && isPlainlyKilled(MI, MOReg, LIS)))) - Kills.insert(MOReg); + Kills.push_back(MOReg); } } @@ -888,8 +898,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator AfterMI = std::next(Begin); MachineBasicBlock::iterator End = AfterMI; - while (End->isCopy() && Defs.count(End->getOperand(1).getReg())) { - Defs.insert(End->getOperand(0).getReg()); + while (End->isCopy() && + regOverlapsSet(Defs, End->getOperand(1).getReg(), TRI)) { + Defs.push_back(End->getOperand(0).getReg()); ++End; } @@ -915,21 +926,21 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, if (!MOReg) continue; if (MO.isDef()) { - if (Uses.count(MOReg)) + if (regOverlapsSet(Uses, MOReg, TRI)) // Physical register use would be clobbered. return false; - if (!MO.isDead() && Defs.count(MOReg)) + if (!MO.isDead() && regOverlapsSet(Defs, MOReg, TRI)) // May clobber a physical register def. // FIXME: This may be too conservative. It's ok if the instruction // is sunken completely below the use. return false; } else { - if (Defs.count(MOReg)) + if (regOverlapsSet(Defs, MOReg, TRI)) return false; bool isKill = MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS)); - if (MOReg != Reg && - ((isKill && Uses.count(MOReg)) || Kills.count(MOReg))) + if (MOReg != Reg && ((isKill && regOverlapsSet(Uses, MOReg, TRI)) || + regOverlapsSet(Kills, MOReg, TRI))) // Don't want to extend other live ranges and update kills. 
return false; if (MOReg == Reg && !isKill) diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index 267a0da..d58bff5 100644 --- a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -19,8 +19,8 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Optional.h" #include "llvm/IR/Attributes.h" +#include "AttributeSetNode.h" #include "llvm/Support/DataTypes.h" -#include "llvm/Support/TrailingObjects.h" #include #include @@ -142,73 +142,6 @@ public: StringRef getStringValue() const { return Val; } }; -//===----------------------------------------------------------------------===// -/// \class -/// \brief This class represents a group of attributes that apply to one -/// element: function, return type, or parameter. -class AttributeSetNode final - : public FoldingSetNode, - private TrailingObjects { - friend TrailingObjects; - - unsigned NumAttrs; ///< Number of attributes in this node. - /// Bitset with a bit for each available attribute Attribute::AttrKind. - uint64_t AvailableAttrs; - - AttributeSetNode(ArrayRef Attrs) - : NumAttrs(Attrs.size()), AvailableAttrs(0) { - static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, - "Too many attributes for AvailableAttrs"); - // There's memory after the node where we can store the entries in. - std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects()); - - for (Attribute I : *this) { - if (!I.isStringAttribute()) { - AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum(); - } - } - } - - // AttributesSetNode is uniqued, these should not be publicly available. - void operator=(const AttributeSetNode &) = delete; - AttributeSetNode(const AttributeSetNode &) = delete; -public: - void operator delete(void *p) { ::operator delete(p); } - - static AttributeSetNode *get(LLVMContext &C, ArrayRef Attrs); - - /// \brief Return the number of attributes this AttributeSet contains. 
- unsigned getNumAttributes() const { return NumAttrs; } - - bool hasAttribute(Attribute::AttrKind Kind) const { - return AvailableAttrs & ((uint64_t)1) << Kind; - } - bool hasAttribute(StringRef Kind) const; - bool hasAttributes() const { return NumAttrs != 0; } - - Attribute getAttribute(Attribute::AttrKind Kind) const; - Attribute getAttribute(StringRef Kind) const; - - unsigned getAlignment() const; - unsigned getStackAlignment() const; - uint64_t getDereferenceableBytes() const; - uint64_t getDereferenceableOrNullBytes() const; - std::pair> getAllocSizeArgs() const; - std::string getAsString(bool InAttrGrp) const; - - typedef const Attribute *iterator; - iterator begin() const { return getTrailingObjects(); } - iterator end() const { return begin() + NumAttrs; } - - void Profile(FoldingSetNodeID &ID) const { - Profile(ID, makeArrayRef(begin(), end())); - } - static void Profile(FoldingSetNodeID &ID, ArrayRef AttrList) { - for (unsigned I = 0, E = AttrList.size(); I != E; ++I) - AttrList[I].Profile(ID); - } -}; - typedef std::pair IndexAttrPair; //===----------------------------------------------------------------------===// diff --git a/lib/IR/AttributeSetNode.h b/lib/IR/AttributeSetNode.h new file mode 100644 index 0000000..fab1ed5 --- /dev/null +++ b/lib/IR/AttributeSetNode.h @@ -0,0 +1,98 @@ +//===-- AttributeSetNode.h - AttributeSet Internal Node ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the node class used internally by AttributeSet. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_ATTRIBUTESETNODE_H +#define LLVM_IR_ATTRIBUTESETNODE_H + +#include "llvm/ADT/FoldingSet.h" +#include "llvm/IR/Attributes.h" +#include "llvm/Support/TrailingObjects.h" +#include + +namespace llvm { + +//===----------------------------------------------------------------------===// +/// \class +/// \brief This class represents a group of attributes that apply to one +/// element: function, return type, or parameter. +class AttributeSetNode final + : public FoldingSetNode, + private TrailingObjects { + friend TrailingObjects; + + unsigned NumAttrs; ///< Number of attributes in this node. + /// Bitset with a bit for each available attribute Attribute::AttrKind. + uint64_t AvailableAttrs; + + AttributeSetNode(ArrayRef Attrs) + : NumAttrs(Attrs.size()), AvailableAttrs(0) { + static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, + "Too many attributes for AvailableAttrs"); + // There's memory after the node where we can store the entries in. + std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects()); + + for (Attribute I : *this) { + if (!I.isStringAttribute()) { + AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum(); + } + } + } + + // AttributesSetNode is uniqued, these should not be publicly available. + void operator=(const AttributeSetNode &) = delete; + AttributeSetNode(const AttributeSetNode &) = delete; +public: + void operator delete(void *p) { ::operator delete(p); } + + static AttributeSetNode *get(LLVMContext &C, ArrayRef Attrs); + + static AttributeSetNode *get(AttributeSet AS, unsigned Index) { + return AS.getAttributes(Index); + } + + /// \brief Return the number of attributes this AttributeSet contains. 
+ unsigned getNumAttributes() const { return NumAttrs; } + + bool hasAttribute(Attribute::AttrKind Kind) const { + return AvailableAttrs & ((uint64_t)1) << Kind; + } + bool hasAttribute(StringRef Kind) const; + bool hasAttributes() const { return NumAttrs != 0; } + + Attribute getAttribute(Attribute::AttrKind Kind) const; + Attribute getAttribute(StringRef Kind) const; + + unsigned getAlignment() const; + unsigned getStackAlignment() const; + uint64_t getDereferenceableBytes() const; + uint64_t getDereferenceableOrNullBytes() const; + std::pair> getAllocSizeArgs() const; + std::string getAsString(bool InAttrGrp) const; + + typedef const Attribute *iterator; + iterator begin() const { return getTrailingObjects(); } + iterator end() const { return begin() + NumAttrs; } + + void Profile(FoldingSetNodeID &ID) const { + Profile(ID, makeArrayRef(begin(), end())); + } + static void Profile(FoldingSetNodeID &ID, ArrayRef AttrList) { + for (unsigned I = 0, E = AttrList.size(); I != E; ++I) + AttrList[I].Profile(ID); + } +}; + +} // end llvm namespace + +#endif diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 431e51b..2e4a2f8 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -251,8 +251,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "sse2.cvtps2pd" || Name == "avx.cvtdq2.pd.256" || Name == "avx.cvt.ps2.pd.256" || - Name == "sse2.cvttps2dq" || - Name.startswith("avx.cvtt.") || Name.startswith("avx.vinsertf128.") || Name == "avx2.vinserti128" || Name.startswith("avx.vextractf128.") || @@ -712,12 +710,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd"); else Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd"); - } else if (IsX86 && (Name == "sse2.cvttps2dq" || - Name.startswith("avx.cvtt."))) { - // Truncation (round to zero) float/double to i32 vector conversion. 
- Value *Src = CI->getArgOperand(0); - VectorType *DstTy = cast(CI->getType()); - Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt"); } else if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); SmallVector Elts; diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index a553614..3c4b0cf 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/Attributes.h" +#include "AttributeSetNode.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -1844,6 +1845,18 @@ void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unwrap(F)->addAttribute(Idx, unwrap(A)); } +unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) { + auto *ASN = AttributeSetNode::get(unwrap(F)->getAttributes(), Idx); + return ASN->getNumAttributes(); +} + +void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs) { + auto *ASN = AttributeSetNode::get(unwrap(F)->getAttributes(), Idx); + for (auto A: make_range(ASN->begin(), ASN->end())) + *Attrs++ = wrap(A); +} + LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unsigned KindID) { @@ -2216,6 +2229,21 @@ void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, CallSite(unwrap(C)).addAttribute(Idx, unwrap(A)); } +unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C, + LLVMAttributeIndex Idx) { + auto CS = CallSite(unwrap(C)); + auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx); + return ASN->getNumAttributes(); +} + +void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs) { + auto CS = CallSite(unwrap(C)); + auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx); + for (auto A: make_range(ASN->begin(), ASN->end())) + *Attrs++ = wrap(A); +} + LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C, 
LLVMAttributeIndex Idx, unsigned KindID) { diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp index 5201c2e..f35c64b 100644 --- a/lib/IR/Metadata.cpp +++ b/lib/IR/Metadata.cpp @@ -675,8 +675,8 @@ void MDNode::handleChangedOperand(void *Ref, Metadata *New) { Metadata *Old = getOperand(Op); setOperand(Op, New); - // Drop uniquing for self-reference cycles. - if (New == this) { + // Drop uniquing for self-reference cycles and deleted constants. + if (New == this || (!New && Old && isa(Old))) { if (!isResolved()) resolve(); storeDistinctInContext(); diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index cfa12a9..2bac2a3 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -201,6 +201,7 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) { switch (Kind) { case UnknownEnvironment: return "unknown"; case GNU: return "gnu"; + case GNUABI64: return "gnuabi64"; case GNUEABIHF: return "gnueabihf"; case GNUEABI: return "gnueabi"; case GNUX32: return "gnux32"; @@ -468,6 +469,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { return StringSwitch(EnvironmentName) .StartsWith("eabihf", Triple::EABIHF) .StartsWith("eabi", Triple::EABI) + .StartsWith("gnuabi64", Triple::GNUABI64) .StartsWith("gnueabihf", Triple::GNUEABIHF) .StartsWith("gnueabi", Triple::GNUEABI) .StartsWith("gnux32", Triple::GNUX32) diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index b1e8816..b97a0f1 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -250,6 +250,7 @@ def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", FeatureMacroOpFusion, FeatureNEON, FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, HasV8_1aOps]>; def : ProcessorModel<"generic", NoSchedModel, [ diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index d6f2a19..ac7de1b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7685,6 +7685,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); @@ -7728,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: - ResTy = MVT::v4i32; + ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } + if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) + return SDValue(); + + assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && + "Illegal vector type after legalization"); + SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs @@ -9853,7 +9860,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - return performFpToIntCombine(N, DAG, Subtarget); + return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, Subtarget); case ISD::OR: diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 7e59710..d4784b5 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -20,6 +20,7 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class GCNTargetMachine; struct MachineSchedContext; class MCAsmInfo; class raw_ostream; @@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); -FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine 
*TM = nullptr); +FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index cfe6346..c9c95c7 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -783,15 +783,19 @@ void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) { emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion, RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2); if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, - RuntimeMD::OpenCL_C, 1); - auto Node = MD->getOperand(0); - unsigned short Major = mdconst::extract(Node->getOperand(0)) - ->getZExtValue(); - unsigned short Minor = mdconst::extract(Node->getOperand(1)) - ->getZExtValue(); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, - Major * 100 + Minor * 10, 2); + if (MD->getNumOperands()) { + auto Node = MD->getOperand(0); + if (Node->getNumOperands() > 1) { + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, + RuntimeMD::OpenCL_C, 1); + uint16_t Major = mdconst::extract(Node->getOperand(0)) + ->getZExtValue(); + uint16_t Minor = mdconst::extract(Node->getOperand(1)) + ->getZExtValue(); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, + Major * 100 + Minor * 10, 2); + } + } } } diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 3b41577..b955e23 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include 
"llvm/CodeGen/Passes.h" @@ -30,15 +32,28 @@ using namespace llvm; namespace { class AMDGPUCodeGenPrepare : public FunctionPass, - public InstVisitor { + public InstVisitor { + const GCNTargetMachine *TM; + const SISubtarget *ST; DivergenceAnalysis *DA; - const TargetMachine *TM; + Module *Mod; + bool HasUnsafeFPMath; public: static char ID; AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID), - TM(TM) { } + TM(static_cast(TM)), + ST(nullptr), + DA(nullptr), + Mod(nullptr), + HasUnsafeFPMath(false) { } + + bool visitFDiv(BinaryOperator &I); + + bool visitInstruction(Instruction &I) { + return false; + } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; @@ -55,7 +70,92 @@ public: } // End anonymous namespace +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { + const ConstantFP *CNum = dyn_cast(Num); + if (!CNum) + return false; + + // Reciprocal f32 is handled separately without denormals. + return UnsafeDiv || CNum->isExactlyValue(+1.0); +} + +// Insert an intrinsic for fast fdiv for safe math situations where we can +// reduce precision. Leave fdiv for situations where the generic node is +// expected to be optimized. 
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { + Type *Ty = FDiv.getType(); + + // TODO: Handle half + if (!Ty->getScalarType()->isFloatTy()) + return false; + + MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); + if (!FPMath) + return false; + + const FPMathOperator *FPOp = cast(&FDiv); + float ULP = FPOp->getFPAccuracy(); + if (ULP < 2.5f) + return false; + + FastMathFlags FMF = FPOp->getFastMathFlags(); + bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || + FMF.allowReciprocal(); + if (ST->hasFP32Denormals() && !UnsafeDiv) + return false; + + IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); + Builder.setFastMathFlags(FMF); + Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); + + const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); + Function *Decl + = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); + + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + Value *NewFDiv = nullptr; + + if (VectorType *VT = dyn_cast(Ty)) { + NewFDiv = UndefValue::get(VT); + + // FIXME: Doesn't do the right thing for cases where the vector is partially + // constant. This works when the scalarizer pass is run first. 
+ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { + Value *NumEltI = Builder.CreateExtractElement(Num, I); + Value *DenEltI = Builder.CreateExtractElement(Den, I); + Value *NewElt; + + if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { + NewElt = Builder.CreateFDiv(NumEltI, DenEltI); + } else { + NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); + } + + NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); + } + } else { + if (!shouldKeepFDivF32(Num, UnsafeDiv)) + NewFDiv = Builder.CreateCall(Decl, { Num, Den }); + } + + if (NewFDiv) { + FDiv.replaceAllUsesWith(NewFDiv); + NewFDiv->takeName(&FDiv); + FDiv.eraseFromParent(); + } + + return true; +} + +static bool hasUnsafeFPMath(const Function &F) { + Attribute Attr = F.getFnAttribute("unsafe-fp-math"); + return Attr.getValueAsString() == "true"; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { + Mod = &M; return false; } @@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (!TM || skipFunction(F)) return false; + ST = &TM->getSubtarget(F); DA = &getAnalysis(); - visit(F); + HasUnsafeFPMath = hasUnsafeFPMath(F); - return true; + bool MadeChange = false; + + for (BasicBlock &BB : F) { + BasicBlock::iterator Next; + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { + Next = std::next(I); + MadeChange |= visit(*I); + } + } + + return MadeChange; } INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, @@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, char AMDGPUCodeGenPrepare::ID = 0; -FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) { +FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) { return new AMDGPUCodeGenPrepare(TM); } diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 6761b4b..3944fdb 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -420,9 
+420,10 @@ int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding -int FP32_NEG_ONE = 0xbf800000; int FP32_ONE = 0x3f800000; +int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; +int FP64_NEG_ONE = 0xbff0000000000000; } def CONST : Constants; diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 791872a..8e3471b 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = { #undef GET_INTRINSIC_NAME_TABLE }; -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - if (IntrID < Intrinsic::num_intrinsics) { - return nullptr; - } +namespace { +#define GET_INTRINSIC_ATTRIBUTES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ATTRIBUTES +} + +StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, + ArrayRef Tys) const { + if (IntrID < Intrinsic::num_intrinsics) + return StringRef(); + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); - std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]); - return Result; + return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; +} + +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned NumTys) const { + return getName(IntrID, makeArrayRef(Tys, NumTys)).str(); +} + +FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, + ArrayRef Tys) const { + // FIXME: Re-use Intrinsic::getType machinery + switch (ID) { + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + Type *F32Ty = Type::getFloatTy(Context); + return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false); + } + default: + llvm_unreachable("unhandled intrinsic"); + } } unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, @@ -69,7 +92,19 @@ bool 
AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { } Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + ArrayRef Tys) const { + FunctionType *FTy = getType(M->getContext(), IntrID, Tys); + Function *F + = cast(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); + + AttributeSet AS = getAttributes(M->getContext(), + static_cast(IntrID)); + F->setAttributes(AS); + return F; +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, Type **Tys, - unsigned numTys) const { - llvm_unreachable("Not implemented"); + unsigned NumTys) const { + return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys)); } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index f417392..6cb8b96 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -34,13 +34,23 @@ enum ID { class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(); + + StringRef getName(unsigned IntrId, ArrayRef Tys = None) const; + std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; bool isOverloaded(unsigned IID) const override; Function *getDeclaration(Module *M, unsigned ID, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + + Function *getDeclaration(Module *M, unsigned ID, + ArrayRef = None) const; + + FunctionType *getType(LLVMContext &Context, unsigned ID, + ArrayRef Tys = None) const; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 7754638..0bad63f 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -348,9 +348,6 @@ static VectorType *arrayTypeToVecType(Type *ArrayTy) { static Value * 
calculateVectorIndex(Value *Ptr, const std::map &GEPIdx) { - if (isa(Ptr)) - return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); - GetElementPtrInst *GEP = cast(Ptr); auto I = GEPIdx.find(GEP); @@ -360,11 +357,11 @@ calculateVectorIndex(Value *Ptr, static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // FIXME we only support simple cases if (GEP->getNumOperands() != 3) - return NULL; + return nullptr; ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); if (!I0 || !I0->isZero()) - return NULL; + return nullptr; return GEP->getOperand(2); } @@ -398,7 +395,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { // are just being conservative for now. if (!AllocaTy || AllocaTy->getElementType()->isVectorTy() || - AllocaTy->getNumElements() > 4) { + AllocaTy->getNumElements() > 4 || + AllocaTy->getNumElements() < 2) { DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } @@ -443,9 +441,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *Ptr = Inst->getOperand(0); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + + Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); Inst->replaceAllUsesWith(ExtractElement); @@ -453,9 +453,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; } case Instruction::Store: { + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Value *Ptr = Inst->getOperand(1); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = 
Builder.CreateLoad(BitCast); Value *NewVecValue = Builder.CreateInsertElement(VecValue, Inst->getOperand(0), @@ -469,7 +471,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; default: - Inst->dump(); llvm_unreachable("Inconsistency in instructions promotable to vector"); } } @@ -477,11 +478,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { } static bool isCallPromotable(CallInst *CI) { - // TODO: We might be able to handle some cases where the callee is a - // constantexpr bitcast of a function. - if (!CI->getCalledFunction()) - return false; - IntrinsicInst *II = dyn_cast(CI); if (!II) return false; @@ -773,28 +769,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { continue; } - IntrinsicInst *Intr = dyn_cast(Call); - if (!Intr) { - // FIXME: What is this for? It doesn't make sense to promote arbitrary - // function calls. If the call is to a defined function that can also be - // promoted, we should be able to do this once that function is also - // rewritten. 
- - std::vector ArgTypes; - for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); - ArgIdx != ArgEnd; ++ArgIdx) { - ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); - } - Function *F = Call->getCalledFunction(); - FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, - F->isVarArg()); - Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), - NewType, F->getAttributes()); - Function *NewF = cast(C); - Call->setCalledFunction(NewF); - continue; - } - + IntrinsicInst *Intr = cast(Call); Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3e53f52..b2d4e11 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -309,6 +309,7 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; + void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; bool addInstSelector() override; @@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&DeadMachineInstructionElimID); } +void GCNPassConfig::addIRPasses() { + // TODO: May want to move later or split into an early and late one. 
+ addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); + + AMDGPUPassConfig::addIRPasses(); +} + bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 8f78edd..8ccd176 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -122,6 +122,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); @@ -832,13 +833,18 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::FP_TO_UINT: if (N->getValueType(0) == MVT::i1) { - Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG)); return; } // Fall-through. Since we don't care about out of bounds values // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint // considers some extra cases which are not necessary here. 
case ISD::FP_TO_SINT: { + if (N->getValueType(0) == MVT::i1) { + Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG)); + return; + } + SDValue Result; if (expandFP_TO_SINT(N, Result, DAG)) Results.push_back(Result); @@ -1052,15 +1058,24 @@ SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); } -SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { +SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode( + ISD::SETCC, + DL, + MVT::i1, + Op, DAG.getConstantFP(1.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETEQ)); +} + +SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode( ISD::SETCC, DL, MVT::i1, - Op, DAG.getConstantFP(0.0f, DL, MVT::f32), - DAG.getCondCode(ISD::SETNE) - ); + Op, DAG.getConstantFP(-1.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETEQ)); } SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 2fb6ee2..9700ce1 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -72,7 +72,8 @@ private: SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 54efdc0..f4b04e3 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -41,7 +41,8 @@ enum { WQM = 1 << 22, VGPRSpill 
= 1 << 23, VOPAsmPrefer32Bit = 1 << 24, - Gather4 = 1 << 25 + Gather4 = 1 << 25, + DisableWQM = 1 << 26 }; } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 51241cf..80d4435 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1134,9 +1134,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineFunction *MF = BB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); DebugLoc DL = MI.getDebugLoc(); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32)) - .addOperand(MI.getOperand(0)) - .addImm(MFI->LDSSize); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) + .addOperand(MI.getOperand(0)) + .addImm(MFI->LDSSize); MI.eraseFromParent(); return BB; } @@ -1792,6 +1792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + return lowerFDIV_FAST(Op, DAG); + } case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), @@ -2098,7 +2101,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Catch division cases where we can use shortcuts with rcp and rsq // instructions. -SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { +SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, + SelectionDAG &DAG) const { SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -2139,47 +2143,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) - return FastLowered; - +// Faster 2.5 ULP division that does not support denormals. 
+SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); - // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag - if (EnableAMDGPUFastFDIV) { - // This does not support denormals. - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - // TODO: Should this propagate fast-math-flags? + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - // rcp does not support denormals. 
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); - } +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); - // Generates more precise fpdiv32. const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); @@ -2209,7 +2214,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); + return lowerFastUnsafeFDIV(Op, DAG); SDLoc SL(Op); SDValue X = Op.getOperand(0); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 8e055ee..1d349fa 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 2f63d4e..6163f05 100644 --- 
a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,8 @@ class InstSI DS = 0; field bits<1> MIMG = 0; field bits<1> FLAT = 0; + + // Whether WQM _must_ be enabled for this instruction. field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; @@ -50,6 +52,9 @@ class InstSI Gather4 = 0; + // Whether WQM _must_ be disabled for this instruction. + field bits<1> DisableWQM = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -81,6 +86,7 @@ class InstSI getDebugLoc(); - TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, + *MF); if (TIDReg == AMDGPU::NoRegister) return TIDReg; diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 227b817..fef8904 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -340,6 +340,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isDisableWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM; + } + + bool isDisableWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DisableWQM; + } + static bool isVGPRSpill(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 253cc32..00f53e8 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -2949,6 +2949,10 @@ multiclass MUBUF_m , MUBUFAddr64Table <0>; + let DisableWQM = 1 in { + def "_exact" : MUBUF_Pseudo ; + } + let addr64 = 0, isCodeGenOnly = 0 in { def _si : MUBUF_Real_si ; } @@ -3019,7 +3023,8 @@ multiclass MUBUFAtomicOther_m { - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in { + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1, + DisableWQM = 1 in { // 
No return variants let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in { @@ -3423,6 +3428,7 @@ class MIMG_Store_Helper op, string asm, let mayStore = 1; let hasSideEffects = 1; let hasPostISelHook = 0; + let DisableWQM = 1; } multiclass MIMG_Store_Addr_Helper op, string asm, @@ -3454,6 +3460,7 @@ class MIMG_Atomic_Helper (opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2208,7 +2208,7 @@ multiclass MUBUF_StoreIntrinsicPat(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset, + (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2217,7 +2217,7 @@ multiclass MUBUF_StoreIntrinsicPat(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset, + (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2226,7 +2226,7 @@ multiclass MUBUF_StoreIntrinsicPat(opcode # _BOTHEN) + (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), @@ -3391,6 +3391,16 @@ def : Pat < (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; +class FPToI1Pat : Pat < + (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), + (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) +>; + +def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; + // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. 
Vector // comparisions still write to a pair of SGPRs, so treat these as diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index a9b7c39..9d06ccf 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// SI Intrinsic Definitions +// Backend internal SI Intrinsic Definitions. User code should not +// directly use these. // //===----------------------------------------------------------------------===// @@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in { } // End TargetPrefix = "SI", isTarget = 1 let TargetPrefix = "amdgcn", isTarget = 1 in { + // Emit 2.5 ulp, no denormal division. Should only be inserted by + // pass based on !fpmath metadata. + def int_amdgcn_fdiv_fast : Intrinsic< + [llvm_float_ty], [llvm_float_ty], [IntrNoMem] + >; + /* Control flow Intrinsics */ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 4d12a1e..848be32 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -203,7 +203,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( Spill.Lane = Lane; if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, + *MF); if (LaneVGPR == AMDGPU::NoRegister) // We have no VGPRs left for spilling SGPRs. 
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 0dd88ee..347c33f 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -957,10 +957,13 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, /// \brief Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. -unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const { +unsigned +SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineFunction &MF) const { + for (unsigned Reg : *RC) - if (!MRI.isPhysRegUsed(Reg)) + if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) return Reg; return AMDGPU::NoRegister; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 6e97b1b..d8b2d9f 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -185,7 +185,8 @@ public: unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const; + const TargetRegisterClass *RC, + const MachineFunction &MF) const; unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index c1a237e..b200c15 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -94,12 +94,15 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; + LiveIntervals *LIS; DenseMap Instructions; DenseMap Blocks; SmallVector ExecExports; SmallVector LiveMaskQueries; + void markInstruction(MachineInstr &MI, char Flag, + std::vector &Worklist); char 
scanInstructions(MachineFunction &MF, std::vector &Worklist); void propagateInstruction(MachineInstr &MI, std::vector &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); @@ -126,6 +129,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -135,8 +139,11 @@ public: char SIWholeQuadMode::ID = 0; -INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE, - "SI Whole Quad Mode", false, false) +INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, + false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, + false) char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; @@ -144,6 +151,23 @@ FunctionPass *llvm::createSIWholeQuadModePass() { return new SIWholeQuadMode; } +void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, + std::vector &Worklist) { + InstrInfo &II = Instructions[&MI]; + + assert(Flag == StateWQM || Flag == StateExact); + + // Ignore if the instruction is already marked. The typical case is that we + // mark an instruction WQM multiple times, but for atomics it can happen that + // Flag is StateWQM, but Needs is already set to StateExact. In this case, + // letting the atomic run in StateExact is correct as per the relevant specs. + if (II.Needs) + return; + + II.Needs = Flag; + Worklist.push_back(&MI); +} + // Scan instructions to determine which ones require an Exact execmask and // which ones seed WQM requirements. 
char SIWholeQuadMode::scanInstructions(MachineFunction &MF, @@ -161,7 +185,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { Flags = StateWQM; - } else if (MI.mayStore() && TII->usesVM_CNT(MI)) { + } else if (TII->isDisableWQM(MI)) { Flags = StateExact; } else { // Handle export instructions with the exec mask valid flag set @@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, continue; } - Instructions[&MI].Needs = Flags; - Worklist.push_back(&MI); + markInstruction(MI, Flags, Worklist); GlobalFlags |= Flags; } @@ -214,9 +237,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references BlockInfo &BI = Blocks[MBB]; - // Control flow-type instructions that are followed by WQM computations - // must themselves be in WQM. - if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) { + // Control flow-type instructions and stores to temporary memory that are + // followed by WQM computations must themselves be in WQM. + if ((II.OutNeeds & StateWQM) && !II.Needs && + (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } @@ -249,32 +273,35 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, if (!Use.isReg() || !Use.isUse()) continue; - // At this point, physical registers appear as inputs or outputs - // and following them makes no sense (and would in fact be incorrect - // when the same VGPR is used as both an output and an input that leads - // to a NeedsWQM instruction). - // - // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we - // have to trace this, in practice it happens for 64-bit computations like - // pointers where both dwords are followed already anyway. 
- if (!TargetRegisterInfo::isVirtualRegister(Use.getReg())) - continue; - - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) { - InstrInfo &DefII = Instructions[&DefMI]; + unsigned Reg = Use.getReg(); - // Obviously skip if DefMI is already flagged as NeedWQM. - // - // The instruction might also be flagged as NeedExact. This happens when - // the result of an atomic is used in a WQM computation. In this case, - // the atomic must not run for helper pixels and the WQM result is - // undefined. - if (DefII.Needs != 0) + // Handle physical registers that we need to track; this is mostly relevant + // for VCC, which can appear as the (implicit) input of a uniform branch, + // e.g. when a loop counter is stored in a VGPR. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Reg == AMDGPU::EXEC) continue; - DefII.Needs = StateWQM; - Worklist.push_back(&DefMI); + for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { + LiveRange &LR = LIS->getRegUnit(*RegUnit); + const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); + if (!Value) + continue; + + // Since we're in machine SSA, we do not need to track physical + // registers across basic blocks. 
+ if (Value->isPHIDef()) + continue; + + markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, + Worklist); + } + + continue; } + + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) + markInstruction(DefMI, StateWQM, Worklist); } } @@ -468,6 +495,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); + LIS = &getAnalysis(); char GlobalFlags = analyzeFunction(MF); if (!(GlobalFlags & StateWQM)) { diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index d6e7caf..3cfcb1e 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -3857,7 +3857,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to convert two saturating conditional selects into a single SSAT SDValue SatValue; uint64_t SatConstant; - if (isSaturatingConditional(Op, SatValue, SatConstant)) + if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && + isSaturatingConditional(Op, SatValue, SatConstant)) return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 060376b..c9735f3 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -3650,7 +3650,8 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), def SSAT : AI<(outs GPRnopc:$Rd), (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), - SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<5> sat_imm; bits<4> Rn; @@ -3666,7 +3667,8 @@ def SSAT : AI<(outs GPRnopc:$Rd), def SSAT16 : AI<(outs GPRnopc:$Rd), (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm, - NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> { + NoItinerary, 
"ssat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; @@ -3679,7 +3681,8 @@ def SSAT16 : AI<(outs GPRnopc:$Rd), def USAT : AI<(outs GPRnopc:$Rd), (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), - SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsARM,HasV6]> { bits<4> Rd; bits<5> sat_imm; bits<4> Rn; @@ -3695,7 +3698,8 @@ def USAT : AI<(outs GPRnopc:$Rd), def USAT16 : AI<(outs GPRnopc:$Rd), (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm, - NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> { + NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 55e5308..fe699b2 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -2240,7 +2240,8 @@ class T2SatI { + NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2251,7 +2252,7 @@ def t2SSAT: T2SatI< def t2SSAT16: T2SatI< (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2265,7 +2266,8 @@ def t2SSAT16: T2SatI< def t2USAT: T2SatI< (outs rGPR:$Rd), (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), - NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1110; let Inst{20} = 0; @@ -2275,7 +2277,7 @@ def t2USAT: T2SatI< def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasDSP]> { + 
Requires<[IsThumb2, HasDSP]> { let Inst{31-22} = 0b1111001110; let Inst{20} = 0; let Inst{15} = 0; diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index cdad7ce..20c5f36 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -518,6 +518,10 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, return true; return false; + case ELF::R_MIPS_GOT_PAGE: + case ELF::R_MICROMIPS_GOT_PAGE: + case ELF::R_MIPS_GOT_OFST: + case ELF::R_MICROMIPS_GOT_OFST: case ELF::R_MIPS_16: case ELF::R_MIPS_32: case ELF::R_MIPS_GPREL32: @@ -539,8 +543,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, case ELF::R_MIPS_SHIFT5: case ELF::R_MIPS_SHIFT6: case ELF::R_MIPS_GOT_DISP: - case ELF::R_MIPS_GOT_PAGE: - case ELF::R_MIPS_GOT_OFST: case ELF::R_MIPS_GOT_HI16: case ELF::R_MIPS_GOT_LO16: case ELF::R_MIPS_INSERT_A: @@ -589,8 +591,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, case ELF::R_MICROMIPS_PC16_S1: case ELF::R_MICROMIPS_CALL16: case ELF::R_MICROMIPS_GOT_DISP: - case ELF::R_MICROMIPS_GOT_PAGE: - case ELF::R_MICROMIPS_GOT_OFST: case ELF::R_MICROMIPS_GOT_HI16: case ELF::R_MICROMIPS_GOT_LO16: case ELF::R_MICROMIPS_SUB: diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index 1622b22..1ce8f07 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -28,12 +28,19 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) { PointerSize = CalleeSaveStackSlotSize = 8; } + // FIXME: This condition isn't quite right but it's the best we can do until + // this object can identify the ABI. It will misbehave when using O32 + // on a mips64*-* triple. 
+ if ((TheTriple.getArch() == Triple::mipsel) || + (TheTriple.getArch() == Triple::mips)) { + PrivateGlobalPrefix = "$"; + PrivateLabelPrefix = "$"; + } + AlignmentIsInBytes = false; Data16bitsDirective = "\t.2byte\t"; Data32bitsDirective = "\t.4byte\t"; Data64bitsDirective = "\t.8byte\t"; - PrivateGlobalPrefix = "$"; - PrivateLabelPrefix = "$"; CommentString = "#"; ZeroDirective = "\t.space\t"; GPRel32Directive = "\t.gpword\t"; diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index c248c3a..80641ed 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -57,7 +57,10 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, else Ret += "E"; - Ret += "-m:m"; + if (ABI.IsO32()) + Ret += "-m:m"; + else + Ret += "-m:e"; // Pointers are 32 bit on some ABIs. if (!ABI.IsN64()) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e547111..2c54838 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1187,6 +1187,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); @@ -13373,6 +13381,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = 
Op.getSimpleValueType(); SDLoc dl(Op); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, @@ -13380,6 +13389,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, DAG.getUNDEF(SrcVT))); } if (SrcVT.getVectorElementType() == MVT::i1) { + if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)) + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src)); MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); @@ -13694,6 +13706,15 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); + if (SVT.getVectorElementType() == MVT::i1) { + if (SVT == MVT::v2i1) + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0)); + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); + } + switch (SVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 1672b38..5f0aab9 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2661,7 +2661,8 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp) const { + MachineOperand &ImplicitOp, + LiveVariables *LV) const { MachineFunction &MF = *MI.getParent()->getParent(); const TargetRegisterClass *RC; if (AllowSP) { @@ -2715,13 +2716,17 @@ bool 
X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. NewSrc = MF.getRegInfo().createVirtualRegister(RC); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) + MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + get(TargetOpcode::COPY)) .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) .addOperand(Src); // Which is obviously going to be dead after we're done with it. isKill = true; isUndef = false; + + if (LV) + LV->replaceKillInstruction(SrcReg, MI, *Copy); } // We've set all the parameters without issue. @@ -2900,7 +2905,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -2943,7 +2948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -2977,7 +2982,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -3016,7 +3021,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, 
/*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; const MachineOperand &Src2 = MI.getOperand(2); @@ -3024,7 +3029,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, - SrcReg2, isKill2, isUndef2, ImplicitOp2)) + SrcReg2, isKill2, isUndef2, ImplicitOp2, LV)) return nullptr; MachineInstrBuilder MIB = @@ -3087,7 +3092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 858f35d..a8a9f62 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -230,7 +230,7 @@ public: bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp) const; + MachineOperand &ImplicitOp, LiveVariables *LV) const; /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. 
When this flag is set, the target diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 9a98f5c..f91764a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1820,7 +1820,7 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; -def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, +def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -1836,7 +1836,7 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; -def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, +def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + [(set VR256:$dst, + (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + 
IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L, + [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 + (loadv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), @@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index fff5440..787f434 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ 
b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -332,6 +332,7 @@ struct ArgumentUsesTracker : public CaptureTracker { namespace llvm { template <> struct GraphTraits { typedef ArgumentGraphNode NodeType; + typedef ArgumentGraphNode *NodeRef; typedef SmallVectorImpl::iterator ChildIteratorType; static inline NodeType *getEntryNode(NodeType *A) { return A; } diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 310c292..99b12d4 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -44,6 +44,7 @@ #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Transforms/Utils/Local.h" #include using namespace llvm; @@ -779,7 +780,8 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL, // Instructions could multiply use V. while (UI != E && *UI == I) ++UI; - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) + I->eraseFromParent(); } } diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index cf5b76d..df6a48e 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -134,6 +134,10 @@ static cl::opt PreInlineThreshold( cl::desc("Control the amount of inlining in pre-instrumentation inliner " "(default = 75)")); +static cl::opt EnableGVNHoist( + "enable-gvn-hoist", cl::init(false), cl::Hidden, + cl::desc("Enable the experimental GVN Hoisting pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -232,7 +236,8 @@ void PassManagerBuilder::populateFunctionPassManager( FPM.add(createCFGSimplificationPass()); FPM.add(createSROAPass()); FPM.add(createEarlyCSEPass()); - FPM.add(createGVNHoistPass()); + if(EnableGVNHoist) + FPM.add(createGVNHoistPass()); FPM.add(createLowerExpectIntrinsicPass()); } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp 
b/lib/Transforms/InstCombine/InstCombineSelect.cpp index d7eed79..8f1ff8a 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -553,8 +553,11 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } } + // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring + // decomposeBitTestICmp() might help. { - unsigned BitWidth = DL.getTypeSizeInBits(TrueVal->getType()); + unsigned BitWidth = + DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); APInt MinSignedValue = APInt::getSignBit(BitWidth); Value *X; const APInt *Y, *C; diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 51c3262..377ccb9 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2830,7 +2830,8 @@ bool InstCombiner::run() { // Add operands to the worklist. replaceInstUsesWith(*I, C); ++NumConstProp; - eraseInstFromFunction(*I); + if (isInstructionTriviallyDead(I, TLI)) + eraseInstFromFunction(*I); MadeIRChange = true; continue; } @@ -2851,7 +2852,8 @@ bool InstCombiner::run() { // Add operands to the worklist. 
replaceInstUsesWith(*I, C); ++NumConstProp; - eraseInstFromFunction(*I); + if (isInstructionTriviallyDead(I, TLI)) + eraseInstFromFunction(*I); MadeIRChange = true; continue; } @@ -3007,7 +3009,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, << *Inst << '\n'); Inst->replaceAllUsesWith(C); ++NumConstProp; - Inst->eraseFromParent(); + if (isInstructionTriviallyDead(Inst, TLI)) + Inst->eraseFromParent(); continue; } diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index dcb62d3..41041c7 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -272,8 +272,9 @@ static bool shouldInstrumentReadWriteFromAddress(Value *Addr) { return false; } - // Check if the global is in a GCOV counter array. - if (GV->getName().startswith("__llvm_gcov_ctr")) + // Check if the global is private gcov data. + if (GV->getName().startswith("__llvm_gcov") || + GV->getName().startswith("__llvm_gcda")) return false; } diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 88172d1..9e98219 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constant.h" @@ -90,11 +91,13 @@ bool ConstantPropagation::runOnFunction(Function &F) { // Remove the dead instruction. WorkList.erase(I); - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) { + I->eraseFromParent(); + ++NumInstKilled; + } // We made a change to the function... 
Changed = true; - ++NumInstKilled; } } return Changed; diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 9d0ef42..0b16e27 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -582,6 +582,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // its simpler value. if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); + bool Killed = false; if (!Inst->use_empty()) { Inst->replaceAllUsesWith(V); Changed = true; @@ -589,11 +590,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (isInstructionTriviallyDead(Inst, &TLI)) { Inst->eraseFromParent(); Changed = true; + Killed = true; } - if (Changed) { + if (Changed) ++NumSimplify; + if (Killed) continue; - } } // If this is a simple instruction that we can value number, process it. diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 542cf38..e958563 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -815,6 +815,14 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, if (!Cast->getModule()->getDataLayout().isLegalInteger(Width)) return; + // Check that `Cast` actually extends the induction variable (we rely on this + // later). This takes care of cases where `Cast` is extending a truncation of + // the narrow induction variable, and thus can end up being narrower than the + // "narrow" induction variable. + uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType()); + if (NarrowIVWidth >= Width) + return; + // Cast is either an sext or zext up to this point. // We should not widen an indvar if arithmetics on the wider indvar are more // expensive than those on the narrower indvar. 
We check only the cost of ADD diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index b9e717c..d1769fc 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -758,7 +758,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) + I->eraseFromParent(); Condition = SimpleVal; } } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 2c0a70e..cdd17fc 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -377,9 +377,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, &I, I.getModule()->getDataLayout(), TLI)) { DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); CurAST->copyValue(&I, C); - CurAST->deleteValue(&I); I.replaceAllUsesWith(C); - I.eraseFromParent(); + if (isInstructionTriviallyDead(&I, TLI)) { + CurAST->deleteValue(&I); + I.eraseFromParent(); + } continue; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 77c77eb..70bd9d3 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -4442,6 +4442,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Determine an input position which will be dominated by the operands and // which will dominate the result. IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); + Rewriter.setInsertPoint(&*IP); // Inform the Rewriter if we have a post-increment use, so that it can // perform an advantageous expansion. 
@@ -4473,7 +4474,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr))); } // Expand the ScaledReg portion. @@ -4491,14 +4492,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand ScaleReg as if it was part of the base regs. if (F.Scale == 1) Ops.push_back( - SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP))); + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr))); else { // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr); } } else { // Otherwise just expand the scaled register and an explicit scale, @@ -4508,11 +4509,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Unless the addressing mode will not be folded. if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)); if (F.Scale != 1) ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4524,7 +4525,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. 
if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4534,7 +4535,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Flush the operand list to suppress SCEVExpander hoisting of both folded and // unfolded offsets. LSR assumes they both live next to their uses. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4570,7 +4571,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); - Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty); // We're done expanding now, so reset the rewriter. Rewriter.clearPostInc(); diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index c5ca563..4f1052d 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -552,9 +553,39 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // two PHINodes, the iteration over the old PHIs remains valid, and the // mapping will just map us to the new node (which may not even be a PHI // node). 
+ const DataLayout &DL = NewFunc->getParent()->getDataLayout(); + SmallSetVector Worklist; for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx) - if (PHINode *PN = dyn_cast(VMap[PHIToResolve[Idx]])) - recursivelySimplifyInstruction(PN); + if (isa(VMap[PHIToResolve[Idx]])) + Worklist.insert(PHIToResolve[Idx]); + + // Note that we must test the size on each iteration, the worklist can grow. + for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { + const Value *OrigV = Worklist[Idx]; + auto *I = dyn_cast_or_null(VMap.lookup(OrigV)); + if (!I) + continue; + + // See if this instruction simplifies. + Value *SimpleV = SimplifyInstruction(I, DL); + if (!SimpleV) + continue; + + // Stash away all the uses of the old instruction so we can check them for + // recursive simplifications after a RAUW. This is cheaper than checking all + // uses of To on the recursive step in most cases. + for (const User *U : OrigV->users()) + Worklist.insert(cast(U)); + + // Replace the instruction with its simplified value. + I->replaceAllUsesWith(SimpleV); + + // If the original instruction had no side effects, remove it. + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); + else + VMap[OrigV] = I; + } // Now that the inlined function body has been fully constructed, go through // and zap unconditional fall-through branches. This happens all the time when diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 1fbb19d..e82c07f 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1294,6 +1294,13 @@ updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode, return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); } +/// Return the result of AI->isStaticAlloca() if AI were moved to the entry +/// block. Allocas used in inalloca calls and allocas of dynamic array size +/// cannot be static. 
+static bool allocaWouldBeStaticInEntry(const AllocaInst *AI ) { + return isa(AI->getArraySize()) && !AI->isUsedWithInAlloca(); +} + /// Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, @@ -1328,7 +1335,7 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, // Don't update static allocas, as they may get moved later. if (auto *AI = dyn_cast(BI)) - if (isa(AI->getArraySize())) + if (allocaWouldBeStaticInEntry(AI)) continue; BI->setDebugLoc(TheCallDL); @@ -1626,7 +1633,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, continue; } - if (!isa(AI->getArraySize())) + if (!allocaWouldBeStaticInEntry(AI)) continue; // Keep track of the static allocas that we inline into the caller. @@ -1635,7 +1642,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Scan for the block of allocas that we can move over, and move them // all at once. 
while (isa(I) && - isa(cast(I)->getArraySize())) { + allocaWouldBeStaticInEntry(cast(I))) { IFI.StaticAllocas.push_back(cast(I)); ++I; } diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index 9658966..0d5a25b 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -64,6 +64,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, DominatorTree &DT, LoopInfo &LI) { SmallVector UsesToRewrite; SmallVector ExitBlocks; + SmallSetVector PHIsToRemove; PredIteratorCache PredCache; bool Changed = false; @@ -115,7 +116,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, SmallVector AddedPHIs; SmallVector PostProcessPHIs; - SSAUpdater SSAUpdate; + SmallVector InsertedPHIs; + SSAUpdater SSAUpdate(&InsertedPHIs); SSAUpdate.Initialize(I->getType(), I->getName()); // Insert the LCSSA phi's into all of the exit blocks dominated by the @@ -184,6 +186,14 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, // Otherwise, do full PHI insertion. SSAUpdate.RewriteUse(*UseToRewrite); + + // SSAUpdater might have inserted phi-nodes inside other loops. We'll need + // to post-process them to keep LCSSA form. + for (PHINode *InsertedPN : InsertedPHIs) { + if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent())) + if (!L->contains(OtherLoop)) + PostProcessPHIs.push_back(InsertedPN); + } } // Post process PHI instructions that were inserted into another disjoint @@ -196,13 +206,19 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, Worklist.push_back(PostProcessPN); } - // Remove PHI nodes that did not have any uses rewritten. + // Keep track of PHI nodes that we want to remove because they did not have + // any uses rewritten. for (PHINode *PN : AddedPHIs) if (PN->use_empty()) - PN->eraseFromParent(); + PHIsToRemove.insert(PN); Changed = true; } + // Remove PHI nodes that did not have any uses rewritten. 
+ for (PHINode *PN : PHIsToRemove) { + assert (PN->use_empty() && "Trying to remove a phi with uses."); + PN->eraseFromParent(); + } return Changed; } diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index b3a928b..2846e8f 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -327,6 +327,8 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, else NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I)); + SmallVector OuterLoopBlocks; + OuterLoopBlocks.push_back(NewBB); // Now that we know which blocks are in L and which need to be moved to // OuterLoop, move any blocks that need it. for (unsigned i = 0; i != L->getBlocks().size(); ++i) { @@ -334,12 +336,53 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (!BlocksInL.count(BB)) { // Move this block to the parent, updating the exit blocks sets L->removeBlockFromLoop(BB); - if ((*LI)[BB] == L) + if ((*LI)[BB] == L) { LI->changeLoopFor(BB, NewOuter); + OuterLoopBlocks.push_back(BB); + } --i; } } + // Split edges to exit blocks from the inner loop, if they emerged in the + // process of separating the outer one. + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + SmallSetVector ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + for (BasicBlock *ExitBlock : ExitBlockSet) { + if (any_of(predecessors(ExitBlock), + [L](BasicBlock *BB) { return !L->contains(BB); })) { + rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); + } + } + + if (PreserveLCSSA) { + // Fix LCSSA form for L. Some values, which previously were only used inside + // L, can now be used in NewOuter loop. We need to insert phi-nodes for them + // in corresponding exit blocks. + + // Go through all instructions in OuterLoopBlocks and check if they are + // using operands from the inner loop. In this case we'll need to fix LCSSA + // for these instructions. 
+ SmallSetVector WorklistSet; + for (BasicBlock *OuterBB: OuterLoopBlocks) { + for (Instruction &I : *OuterBB) { + for (Value *Op : I.operands()) { + Instruction *OpI = dyn_cast(Op); + if (!OpI || !L->contains(OpI)) + continue; + WorklistSet.insert(OpI); + } + } + } + SmallVector Worklist(WorklistSet.begin(), + WorklistSet.end()); + formLCSSAForInstructions(Worklist, *DT, *LI); + assert(NewOuter->isRecursivelyLCSSAForm(*DT) && + "LCSSA is broken after separating nested loops!"); + } + return NewOuter; } @@ -541,17 +584,12 @@ ReprocessLoop: SmallSetVector ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); for (BasicBlock *ExitBlock : ExitBlockSet) { - for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock); - PI != PE; ++PI) - // Must be exactly this loop: no subloops, parent loops, or non-loop preds - // allowed. - if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA)) { - ++NumInserted; - Changed = true; - } - break; - } + if (any_of(predecessors(ExitBlock), + [L](BasicBlock *BB) { return !L->contains(BB); })) { + rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); + ++NumInserted; + Changed = true; + } } // If the header has more than two predecessors at this point (from the diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8b85e32..ee5733d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -50,6 +50,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -220,6 +221,81 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; +// A traits type that is intended to be used in graph algorithms. 
The graph it +// models starts at the loop header, and traverses the BasicBlocks that are in +// the loop body, but not the loop header. Since the loop header is skipped, +// the back edges are excluded. +struct LoopBodyTraits { + using NodeRef = std::pair; + + // This wraps a const Loop * into the iterator, so we know which edges to + // filter out. + class WrappedSuccIterator + : public iterator_adaptor_base< + WrappedSuccIterator, succ_iterator, + typename std::iterator_traits::iterator_category, + NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + using BaseT = iterator_adaptor_base< + WrappedSuccIterator, succ_iterator, + typename std::iterator_traits::iterator_category, + NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>; + + const Loop *L; + + public: + WrappedSuccIterator(succ_iterator Begin, const Loop *L) + : BaseT(Begin), L(L) {} + + NodeRef operator*() const { return {L, *I}; } + }; + + struct LoopBodyFilter { + bool operator()(NodeRef N) const { + const Loop *L = N.first; + return N.second != L->getHeader() && L->contains(N.second); + } + }; + + using ChildIteratorType = + filter_iterator; + + static NodeRef getEntryNode(const Loop &G) { return {&G, G.getHeader()}; } + + static ChildIteratorType child_begin(NodeRef Node) { + return make_filter_range(make_range( + {succ_begin(Node.second), Node.first}, + {succ_end(Node.second), Node.first}), + LoopBodyFilter{}) + .begin(); + } + + static ChildIteratorType child_end(NodeRef Node) { + return make_filter_range(make_range( + {succ_begin(Node.second), Node.first}, + {succ_end(Node.second), Node.first}), + LoopBodyFilter{}) + .end(); + } +}; + +/// Returns true if the given loop body has a cycle, excluding the loop +/// itself. 
+static bool hasCyclesInLoopBody(const Loop &L) { + if (!L.empty()) + return true; + + for (const auto SCC : + make_range(scc_iterator::begin(L), + scc_iterator::end(L))) { + if (SCC.size() > 1) { + DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n"); + DEBUG(L.dump()); + return true; + } + } + return false; +} + /// \brief This modifies LoopAccessReport to initialize message with /// loop-vectorizer-specific part. class VectorizationReport : public LoopAccessReport { @@ -1782,12 +1858,14 @@ private: Instruction *UnsafeAlgebraInst; }; -static void addInnerLoop(Loop &L, SmallVectorImpl &V) { - if (L.empty()) - return V.push_back(&L); - +static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl &V) { + if (L.empty()) { + if (!hasCyclesInLoopBody(L)) + V.push_back(&L); + return; + } for (Loop *InnerL : L) - addInnerLoop(*InnerL, V); + addAcyclicInnerLoop(*InnerL, V); } /// The LoopVectorize Pass. @@ -4395,6 +4473,9 @@ bool LoopVectorizationLegality::canVectorize() { return false; } + // FIXME: The code is currently dead, since the loop gets sent to + // LoopVectorizationLegality is already an innermost loop. + // // We can only vectorize innermost loops. if (!TheLoop->empty()) { emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); @@ -6639,7 +6720,7 @@ bool LoopVectorizePass::runImpl( SmallVector Worklist; for (Loop *L : *LI) - addInnerLoop(*L, Worklist); + addAcyclicInnerLoop(*L, Worklist); LoopsAnalyzed += Worklist.size(); diff --git a/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll b/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll new file mode 100644 index 0000000..a71b5e8 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -mtriple=aarch64-linux-eabi -o - | FileCheck %s + +%struct.a= type { i64, i64, i64, i64 } + +; DAG combine will try to perform a transformation that creates a vcvtfp2fxs +; with a v4f64 input. Since v4i64 is not legal we should bail out. 
We can +; pottentially still create the vcvtfp2fxs node after legalization (but on a +; v2f64). + +; CHECK-LABEL: fun1 +define void @fun1() local_unnamed_addr { +entry: + %mul = fmul <4 x double> zeroinitializer, + %toi = fptosi <4 x double> %mul to <4 x i64> + %ptr = getelementptr inbounds %struct.a, %struct.a* undef, i64 0, i32 2 + %elem = extractelement <4 x i64> %toi, i32 1 + store i64 %elem, i64* %ptr, align 8 + call void @llvm.trap() + unreachable +} + +; Function Attrs: noreturn nounwind +declare void @llvm.trap() + diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll index a12132f..d78c751 100644 --- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll +++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll @@ -1,8 +1,246 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s -; RUN: opt -S -amdgpu-codegenprepare < %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s +; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s ; Make sure this doesn't crash with no triple -; CHECK-LABEL: @foo( -define void @foo() { +; NOOP-LABEL: @noop_fdiv_fpmath( +; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0 +define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out ret void } + +; CHECK-LABEL: @fdiv_fpmath( +; CHECK: %no.md = fdiv float %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 +; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 +; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +define void 
@fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { + %no.md = fdiv float %a, %b + store volatile float %no.md, float addrspace(1)* %out + + %md.half.ulp = fdiv float %a, %b, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %md.1ulp = fdiv float %a, %b, !fpmath !2 + store volatile float %md.1ulp, float addrspace(1)* %out + + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + + %md.3ulp = fdiv float %a, %b, !fpmath !3 + store volatile float %md.3ulp, float addrspace(1)* %out + + %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 + store volatile float %fast.md.25ulp, float addrspace(1)* %out + + %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 + store volatile float %arcp.md.25ulp, float addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @rcp_fdiv_fpmath( +; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} +; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0 +; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 +; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}} +; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 +; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} +; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 +define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { + %no.md = fdiv float 1.0, %x + store volatile float %no.md, float addrspace(1)* %out + + %md.25ulp = fdiv float 1.0, %x, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + + %md.half.ulp = fdiv float 1.0, %x, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %arcp.no.md = fdiv arcp float 1.0, %x + store volatile float %arcp.no.md, float addrspace(1)* %out + + %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0 + store volatile float %arcp.25ulp, float addrspace(1)* %out + + %fast.no.md = fdiv fast float 1.0, %x + store volatile float %fast.no.md, float addrspace(1)* %out 
+ + %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0 + store volatile float %fast.25ulp, float addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @fdiv_fpmath_vector( +; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 + +; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0 +; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0 +; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 +; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 +; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0 +; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1 +define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { + %no.md = fdiv <2 x float> %a, %b + store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + + %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 + store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out + + %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 + store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out + + %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0 + store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @rcp_fdiv_fpmath_vector( +; CHECK: %no.md = fdiv <2 x float> , %x{{$}} +; CHECK: %md.half.ulp = fdiv <2 x float> , %x, !fpmath !1 +; CHECK: %arcp.no.md = fdiv arcp <2 x float> , %x{{$}} +; CHECK: %fast.no.md = fdiv fast <2 x float> , %x{{$}} + +; CHECK: extractelement <2 x float> %x +; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: extractelement <2 x float> %x +; CHECK: fdiv arcp 
float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: store volatile <2 x float> %arcp.25ulp + +; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out +define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { + %no.md = fdiv <2 x float> , %x + store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + + %md.half.ulp = fdiv <2 x float> , %x, !fpmath !1 + store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out + + %arcp.no.md = fdiv arcp <2 x float> , %x + store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + + %fast.no.md = fdiv fast <2 x float> , %x + store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out + + %arcp.25ulp = fdiv arcp <2 x float> , %x, !fpmath !0 + store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + + %fast.25ulp = fdiv fast <2 x float> , %x, !fpmath !0 + store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat( +; CHECK: %no.md = fdiv <2 x float> , %x +; CHECK: %arcp.no.md = fdiv arcp <2 x float> , %x +; CHECK: %fast.no.md = fdiv fast <2 x float> , %x{{$}} + +; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0 +; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0 +; CHECK: store volatile <2 x float> %arcp.25ulp + +; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0 +; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 +; CHECK: store volatile <2 x float> %fast.25ulp +define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> 
addrspace(1)* %out, <2 x float> %x) #1 { + %no.md = fdiv <2 x float> , %x + store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + + %arcp.no.md = fdiv arcp <2 x float> , %x + store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + + %fast.no.md = fdiv fast <2 x float> , %x + store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out + + %arcp.25ulp = fdiv arcp <2 x float> , %x, !fpmath !0 + store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + + %fast.25ulp = fdiv fast <2 x float> , %x, !fpmath !0 + store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; FIXME: Should be able to get fdiv for 1.0 component +; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: store volatile <2 x float> %arcp.25ulp + +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: store volatile <2 x float> %fast.25ulp +define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { + %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 + + %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + + %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @fdiv_fpmath_f32_denormals( +; CHECK: %no.md = fdiv float %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 +; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 +; CHECK: 
%md.3ulp = fdiv float %a, %b, !fpmath !3 +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { + %no.md = fdiv float %a, %b + store volatile float %no.md, float addrspace(1)* %out + + %md.half.ulp = fdiv float %a, %b, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %md.1ulp = fdiv float %a, %b, !fpmath !2 + store volatile float %md.1ulp, float addrspace(1)* %out + + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + + %md.3ulp = fdiv float %a, %b, !fpmath !3 + store volatile float %md.3ulp, float addrspace(1)* %out + + %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 + store volatile float %fast.md.25ulp, float addrspace(1)* %out + + %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 + store volatile float %arcp.md.25ulp, float addrspace(1)* %out + + ret void +} + +attributes #0 = { nounwind optnone noinline } +attributes #1 = { nounwind } +attributes #2 = { nounwind "target-features"="+fp32-denormals" } + +; CHECK: !0 = !{float 2.500000e+00} +; CHECK: !1 = !{float 5.000000e-01} +; CHECK: !2 = !{float 1.000000e+00} +; CHECK: !3 = !{float 3.000000e+00} + +!0 = !{float 2.500000e+00} +!1 = !{float 5.000000e-01} +!2 = !{float 1.000000e+00} +!3 = !{float 3.000000e+00} diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 7b51586..bd0817d 100644 --- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -417,12 +417,6 @@ entry: ret void } -; HSAOPT: !0 = !{} -; HSAOPT: !1 = !{i32 0, i32 2048} - -; NOHSAOPT: !0 = !{i32 0, i32 2048} - - ; FUNC-LABEL: v16i32_stack: ; R600: MOVA_INT @@ -527,4 +521,33 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { ret void } +; OPT-LABEL: 
@direct_alloca_read_0xi32( +; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)* +; OPT: load [0 x i32], [0 x i32] addrspace(3)* +define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [0 x i32] + store [0 x i32] [], [0 x i32]* %tmp + %load = load [0 x i32], [0 x i32]* %tmp + store [0 x i32] %load, [0 x i32] addrspace(1)* %out + ret void +} + +; OPT-LABEL: @direct_alloca_read_1xi32( +; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)* +; OPT: load [1 x i32], [1 x i32] addrspace(3)* +define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [1 x i32] + store [1 x i32] [i32 0], [1 x i32]* %tmp + %load = load [1 x i32], [1 x i32]* %tmp + store [1 x i32] %load, [1 x i32] addrspace(1)* %out + ret void +} + attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" } + +; HSAOPT: !0 = !{} +; HSAOPT: !1 = !{i32 0, i32 2048} + +; NOHSAOPT: !0 = !{i32 0, i32 2048} diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll index ff730a0..0063624 100644 --- a/test/CodeGen/AMDGPU/basic-branch.ll +++ b/test/CodeGen/AMDGPU/basic-branch.ll @@ -6,7 +6,6 @@ ; GCN-LABEL: {{^}}test_branch: ; GCNNOOPT: v_writelane_b32 ; GCNNOOPT: v_writelane_b32 -; GCNNOOPT: v_writelane_b32 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] ; GCN: ; BB#1 diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll index 4021233..65464cd 100644 --- a/test/CodeGen/AMDGPU/fdiv.ll +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -1,8 +1,4 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | 
FileCheck -check-prefix=I754 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; These tests check that fdiv is expanded correctly and also test that the @@ -15,22 +11,59 @@ ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 +; SI: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 ; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_mul_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_div_fmas_f32 +; SI: v_div_fixup_f32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv float %a, %b + store float %fdiv, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fdiv_25ulp_f32: +; SI: v_cndmask_b32 +; SI: v_mul_f32 +; SI: v_rcp_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv float %a, %b, !fpmath !0 + store float %fdiv, float addrspace(1)* %out + ret void +} + +; Use correct fdiv +; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32: +; SI: v_fma_f32 +; SI: v_div_fmas_f32 +; SI: v_div_fixup_f32 +define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +entry: + %fdiv = fdiv float %a, %b, !fpmath !0 + store float %fdiv, float addrspace(1)* %out + ret void +} -; I754-DAG: v_div_scale_f32 -; I754-DAG: v_rcp_f32 -; I754-DAG: v_fma_f32 -; I754-DAG: v_mul_f32 -; I754-DAG: v_fma_f32 -; I754-DAG: v_div_fixup_f32 -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32: +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: 
[[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: - %0 = fdiv float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv fast float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -38,15 +71,14 @@ entry: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) { +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: [[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: - %0 = fdiv fast float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv fast float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -54,15 +86,14 @@ entry: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) { +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: [[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: - %0 = fdiv arcp float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv arcp float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -72,26 +103,24 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: 
v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +entry: + %fdiv = fdiv <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out + ret void +} -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32: +; SI: v_cmp_gt_f32 +; SI: v_cmp_gt_f32 +define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -101,19 +130,12 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv fast <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv fast <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -123,19 +145,12 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** 
*}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv arcp <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv arcp <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -149,37 +164,11 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 - -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr 
@@ -198,24 +187,11 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -234,24 +210,11 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) 
#0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -259,3 +222,9 @@ define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> ad store <4 x float> %result, <4 x float> addrspace(1)* %out ret void } + +attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" } +attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" } +attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" } + +!0 = !{float 2.500000e+00} diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll index be23e10..1537d67 100644 --- a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll +++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -1,7 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare double @llvm.fabs.f64(double) #1 ; FUNC-LABEL: @fp_to_sint_f64_i32 ; SI: v_cvt_i32_f64_e32 @@ -54,3 +55,23 @@ define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in store i64 %cast, i64 addrspace(1)* %out, align 8 ret void } + +; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1: +; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}} +define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { + %conv = fptosi double %in to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1: +; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}| +define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { + %in.fabs = 
call double @llvm.fabs.f64(double %in) + %conv = fptosi double %in.fabs to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll index b39aead..0cd0358 100644 --- a/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC -declare float @llvm.fabs.f32(float) #0 +declare float @llvm.fabs.f32(float) #1 ; FUNC-LABEL: {{^}}fp_to_sint_i32: ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} @@ -17,7 +17,7 @@ define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { ; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: ; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { - %in.fabs = call float @llvm.fabs.f32(float %in) #0 + %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptosi float %in.fabs to i32 store i32 %conv, i32 addrspace(1)* %out ret void @@ -227,4 +227,26 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { ret void } -attributes #0 = { nounwind readnone } +; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i1: +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}} + +; EG: AND_INT +; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y, +; EG-NEXT: -1082130432(-1.000000e+00) +define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { + %conv = fptosi float %in to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}| +define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { + %in.fabs = call float 
@llvm.fabs.f32(float %in) + %conv = fptosi float %in.fabs to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll index 760019e..d5bc416 100644 --- a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll +++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -1,7 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare double @llvm.fabs.f64(double) #1 ; SI-LABEL: {{^}}fp_to_uint_i32_f64: ; SI: v_cvt_u32_f64_e32 @@ -68,3 +69,23 @@ define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> % store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 ret void } + +; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1: +; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}} +define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { + %conv = fptoui double %in to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1: +; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}| +define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { + %in.fabs = call double @llvm.fabs.f64(double %in) + %conv = fptoui double %in.fabs to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll index b7b6ccc..8a0f9fa 100644 --- a/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -1,6 +1,8 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG 
-check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC + +declare float @llvm.fabs.f32(float) #1 ; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32: ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} @@ -215,3 +217,27 @@ define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> store <4 x i64> %conv, <4 x i64> addrspace(1)* %out ret void } + + +; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i1: +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}} + +; EG: AND_INT +; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0, +define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { + %conv = fptoui float %in to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}| +define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { + %in.fabs = call float @llvm.fabs.f32(float %in) + %conv = fptoui float %in.fabs to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll new file mode 100644 index 0000000..4e17a92 --- /dev/null +++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll @@ -0,0 +1,8 @@ +; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s +; check llc does not crash for invalid opencl version metadata + +; CHECK: .section .AMDGPU.runtime_metadata +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .short 256 + 
+!opencl.ocl.version = !{} diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll new file mode 100644 index 0000000..35b7d70 --- /dev/null +++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll @@ -0,0 +1,9 @@ +; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s +; check llc does not crash for invalid opencl version metadata + +; CHECK: .section .AMDGPU.runtime_metadata +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .short 256 + +!opencl.ocl.version = !{!0} +!0 = !{} diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll new file mode 100644 index 0000000..e169355 --- /dev/null +++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll @@ -0,0 +1,9 @@ +; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s +; check llc does not crash for invalid opencl version metadata + +; CHECK: .section .AMDGPU.runtime_metadata +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .short 256 + +!opencl.ocl.version = !{!0} +!0 = !{i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll new file mode 100644 index 0000000..54d7848 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +declare float @llvm.amdgcn.fdiv.fast(float, float) #0 + +; CHECK-LABEL: {{^}}test_fdiv_fast: +; CHECK: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc +; CHECK: v_mul_f32_e32 +; CHECK: v_rcp_f32_e32 +; CHECK: v_mul_f32_e32 +; CHECK: v_mul_f32_e32 +define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { + %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b) + store float %fdiv, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll 
b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll deleted file mode 100644 index cf6d1ab..0000000 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s - - -@lds0 = addrspace(3) global [512 x float] undef, align 4 -@lds1 = addrspace(3) global [256 x float] undef, align 4 - -; FUNC-LABEL: {{^}}groupstaticsize_test0: -; CHECK: s_movk_i32 s{{[0-9]+}}, 0x800 -define void @get_groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %idx.0 = add nsw i32 %tid.x, 64 - %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 - store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - store float %val0, float addrspace(1)* %out, align 4 - - ret void -} - - -; FUNC-LABEL: {{^}}groupstaticsize_test1: -; CHECK: s_movk_i32 s{{[0-9]+}}, 0xc00 -define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { -entry: - %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 - store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %idx.0 = add nsw i32 %tid.x, 64 - %tmp = icmp eq i32 %cond, 0 - br i1 %tmp, label %if, label %else - -if: ; preds = %entry - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - store float %val0, float addrspace(1)* %out, align 4 - br label %endif - -else: ; preds = %entry - 
%arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - store float %val1, float addrspace(1)* %out, align 4 - br label %endif - -endif: ; preds = %else, %if - ret void -} - - -declare i32 @llvm.amdgcn.groupstaticsize() #1 -declare i32 @llvm.amdgcn.workitem.id.x() #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll new file mode 100644 index 0000000..6014e2e --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll @@ -0,0 +1,66 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s + +@lds0 = addrspace(3) global [512 x float] undef, align 4 +@lds1 = addrspace(3) global [256 x float] undef, align 4 + +@large = addrspace(3) global [4096 x i32] undef, align 4 + +; CHECK-LABEL: {{^}}groupstaticsize_test0: +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} +define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %idx.0 = add nsw i32 %tid.x, 64 + %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 + store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + store float %val0, float addrspace(1)* %out, align 4 + + ret void +} + +; CHECK-LABEL: {{^}}groupstaticsize_test1: +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} +define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { +entry: + 
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 + store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %idx.0 = add nsw i32 %tid.x, 64 + %tmp = icmp eq i32 %cond, 0 + br i1 %tmp, label %if, label %else + +if: ; preds = %entry + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + store float %val0, float addrspace(1)* %out, align 4 + br label %endif + +else: ; preds = %entry + %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + store float %val1, float addrspace(1)* %out, align 4 + br label %endif + +endif: ; preds = %else, %if + ret void +} + +; Exceeds 16-bit simm limit of s_movk_i32 +; CHECK-LABEL: {{^}}large_groupstaticsize: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} +define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { + %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx + store volatile i32 0, i32 addrspace(3)* %gep + %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() + store i32 %static_lds_size, i32 addrspace(1)* %size + ret void +} + +declare i32 @llvm.amdgcn.groupstaticsize() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll index b1d4220..27a88f7 100644 --- a/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -1,11 +1,96 @@ -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: Evergreen only ever does unsafe fp math. ; FUNC-LABEL: {{^}}rcp_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + ; EG: RECIP_IEEE -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void } + +; FUNC-LABEL: {{^}}rcp_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_fast_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv fast float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_arcp_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv arcp float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_global_fast_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: 
v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { + %rcp = fdiv float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_fabs_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], |[[SRC]]| +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { + %src.fabs = call float @llvm.fabs.f32(float %src) + %rcp = fdiv float 1.0, %src.fabs + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FIXME: fneg folded into constant 1 +; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_f32: +define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { + %src.fabs = call float @llvm.fabs.f32(float %src) + %src.fabs.fneg = fsub float -0.0, %src.fabs + %rcp = fdiv float 1.0, %src.fabs.fneg + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + + +declare float @llvm.fabs.f32(float) #1 + +attributes #0 = { nounwind "unsafe-fp-math"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "unsafe-fp-math"="true" } + +!0 = !{float 2.500000e+00} diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll deleted file mode 100644 index f9292a7..0000000 --- a/test/CodeGen/AMDGPU/reciprocal.ll +++ /dev/null @@ -1,13 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define amdgpu_ps void @test(<4 x float> inreg %reg0) { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = fdiv float 1.0, %r0 - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) diff --git 
a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll index 10187f6..4ba4ac7 100644 --- a/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -348,7 +348,6 @@ bb7: ; preds = %bb4 ; CHECK: image_sample_c ; CHECK: v_cmp_neq_f32_e32 vcc, 0, -; CHECK: s_and_b64 exec, exec, ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc ; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]] @@ -385,6 +384,7 @@ bb9: ; preds = %bb4 declare void @llvm.AMDGPU.kill(float) #0 declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind attributes #0 = { nounwind } attributes #1 = { nounwind readnone } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll index c151ca9..7dcf36f 100644 --- a/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/test/CodeGen/AMDGPU/vector-alloca.ll @@ -3,6 +3,11 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s + +; OPT-LABEL: @vector_read( +; OPT: %0 = extractelement <4 x i32> , i32 %index +; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 ; FUNC-LABEL: {{^}}vector_read: ; EG: MOV @@ -12,21 +17,26 @@ ; EG: MOVA_INT define void @vector_read(i32 addrspace(1)* %out, i32 %index) { entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = 
getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 + %tmp = alloca [4 x i32] + %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 store i32 0, i32* %x store i32 1, i32* %y store i32 2, i32* %z store i32 3, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index - %2 = load i32, i32* %1 - store i32 %2, i32 addrspace(1)* %out + %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index + %tmp2 = load i32, i32* %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out ret void } +; OPT-LABEL: @vector_write( +; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index +; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index +; OPT: store i32 %1, i32 addrspace(1)* %out, align 4 + ; FUNC-LABEL: {{^}}vector_write: ; EG: MOV ; EG: MOV @@ -36,42 +46,95 @@ entry: ; EG: MOVA_INT define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 + %tmp = alloca [4 x i32] + %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 store i32 0, i32* %x store i32 0, i32* %y store i32 0, i32* %z store i32 0, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index - store i32 1, i32* %1 - %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index - %3 = 
load i32, i32* %2 - store i32 %3, i32 addrspace(1)* %out + %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %w_index + store i32 1, i32* %tmp1 + %tmp2 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %r_index + %tmp3 = load i32, i32* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out ret void } ; This test should be optimize to: ; store i32 0, i32 addrspace(1)* %out + +; OPT-LABEL: @bitcast_gep( +; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4 + ; FUNC-LABEL: {{^}}bitcast_gep: ; EG: STORE_RAW define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 + %tmp = alloca [4 x i32] + %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 store i32 0, i32* %x store i32 0, i32* %y store i32 0, i32* %z store i32 0, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %2 = bitcast i32* %1 to [4 x i32]* - %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0 - %4 = load i32, i32* %3 - store i32 %4, i32 addrspace(1)* %out + %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %tmp2 = bitcast i32* %tmp1 to [4 x i32]* + %tmp3 = getelementptr [4 x i32], [4 x i32]* %tmp2, i32 0, i32 0 + %tmp4 = load i32, i32* %tmp3 + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @vector_read_bitcast_gep( +; OPT: %0 = extractelement <4 x i32> , i32 %index +; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 +define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [4 x i32] + %x = getelementptr inbounds 
[4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 + %bc = bitcast i32* %x to float* + store float 1.0, float* %bc + store i32 1, i32* %y + store i32 2, i32* %z + store i32 3, i32* %w + %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index + %tmp2 = load i32, i32* %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should be able to promote this. Instcombine should fold the +; cast in the hasOneUse case so it might not matter in practice + +; OPT-LABEL: @vector_read_bitcast_alloca( +; OPT: alloca [4 x float] +; OPT: store float +; OPT: store float +; OPT: store float +; OPT: store float +; OPT: load float +define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [4 x i32] + %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]* + %x = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 0 + %y = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 1 + %z = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 2 + %w = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 3 + store float 0.0, float* %x + store float 1.0, float* %y + store float 2.0, float* %z + store float 4.0, float* %w + %tmp1 = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 %index + %tmp2 = load float, float* %tmp1 + store float %tmp2, float addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll index 23b0ffd..809a7ba 100644 --- a/test/CodeGen/AMDGPU/wqm.ll +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -41,14 +41,14 @@ main_body: ;CHECK: store ;CHECK-NOT: exec ;CHECK: .size test3 -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, 
<4 x i32> %c) { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tex.1 = bitcast <4 x float> %tex to <4 x i32> %tex.2 = extractelement <4 x i32> %tex.1, i32 0 - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2 - %wr = extractelement <4 x float> %tex, i32 1 - store float %wr, float addrspace(1)* %gep + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0) + ret <4 x float> %tex } @@ -66,8 +66,9 @@ main_body: define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) { main_body: %c.1 = mul i32 %c, %d - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1 - store float %data, float addrspace(1)* %gep + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %tex } @@ -89,7 +90,7 @@ main_body: ;CHECK: s_mov_b64 exec, [[SAVED]] ;CHECK: %IF ;CHECK: image_sample -define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -100,8 +101,7 @@ IF: br label %END ELSE: - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) br label 
%END END: @@ -129,7 +129,7 @@ END: ;CHECK: s_or_b64 exec, exec, ;CHECK: v_mov_b32_e32 v0 ;CHECK: ; return -define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %ELSE, label %IF @@ -140,8 +140,7 @@ IF: br label %END ELSE: - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) br label %END END: @@ -163,23 +162,20 @@ END: ;CHECK: store ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmp -define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { main_body: %idx.1 = extractelement <3 x i32> %idx, i32 0 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 0 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) ; The load that determines the branch (and should therefore be WQM) is ; surrounded by stores that require disabled WQM. 
%idx.2 = extractelement <3 x i32> %idx, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 - %z = load float, float addrspace(1)* %gep.2 + %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0) %idx.3 = extractelement <3 x i32> %idx, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 %data.3 = extractelement <2 x float> %data, i32 1 - store float %data.3, float addrspace(1)* %gep.3 + call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0) %cc = fcmp ogt float %z, 0.0 br i1 %cc, label %IF, label %ELSE @@ -210,24 +206,21 @@ END: ;CHECK: load ;CHECK: store ;CHECK: v_cmp -define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tex.1 = extractelement <4 x float> %tex, i32 0 %idx.1 = extractelement <3 x i32> %idx, i32 0 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 0 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) %idx.2 = extractelement <3 x i32> %idx, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 - %z = load float, float addrspace(1)* %gep.2 + %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0) %idx.3 = extractelement <3 x i32> %idx, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 %data.3 = extractelement <2 x float> %data, i32 1 - store float 
%data.3, float addrspace(1)* %gep.3 + call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0) %cc = fcmp ogt float %z, 0.0 br i1 %cc, label %IF, label %ELSE @@ -258,15 +251,14 @@ END: ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: %END ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) { +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) { main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END IF: - %data = load float, float addrspace(1)* %ptr - %gep = getelementptr float, float addrspace(1)* %ptr, i32 1 - store float %data, float addrspace(1)* %gep + %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) br label %END END: @@ -282,13 +274,11 @@ END: ;CHECK-NEXT: s_wqm_b64 exec, exec ;CHECK: image_sample ;CHECK: s_and_b64 exec, exec, [[ORIG]] -;SI: buffer_store_dword -;VI: flat_store_dword +;CHECK: buffer_store_dword ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmpx_ ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]] -;SI: buffer_store_dword -;VI: flat_store_dword +;CHECK: buffer_store_dword ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) { @@ -296,16 +286,14 @@ main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %idx.0 = extractelement <2 x i32> %idx, i32 0 - %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0 %data.0 = extractelement <2 
x float> %data, i32 0 - store float %data.0, float addrspace(1)* %gep.0 + call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) call void @llvm.AMDGPU.kill(float %z) %idx.1 = extractelement <2 x i32> %idx, i32 1 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 1 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %out = fadd <4 x float> %tex, %tex2 @@ -321,16 +309,14 @@ main_body: ; CHECK: s_wqm_b64 exec, exec ; CHECK: image_sample ; CHECK: s_and_b64 exec, exec, [[ORIG]] -; SI: buffer_store_dword -; VI: flat_store_dword +; CHECK: buffer_store_dword ; CHECK-NOT: wqm ; CHECK: v_cmpx_ -define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) call void @llvm.AMDGPU.kill(float %z) @@ -350,9 +336,91 @@ main_body: ret float %s } +; CHECK-LABEL: {{^}}test_loop_vcc: +; CHECK-NEXT: ; %entry +; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: image_store +; CHECK: s_wqm_b64 
exec, exec +; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2 +; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]] + +; CHECK: [[LOOPHDR]]: ; %loop +; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]] +; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]] +; CHECK: s_cbranch_vccz +; CHECK: ; %break + +; CHECK: ; return +define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { +entry: + call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0) + br label %loop + +loop: + %ctr.iv = phi i32 [ 0, %entry ], [ %ctr.next, %body ] + %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] + %cc = icmp sgt i32 %ctr.iv, 7 + br i1 %cc, label %break, label %body + +body: + %c.i = bitcast <4 x float> %c.iv to <4 x i32> + %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %ctr.next = add i32 %ctr.iv, 2 + br label %loop + +break: + ret <4 x float> %c.iv +} + +; Only intrinsic stores need exact execution -- other stores do not have +; externally visible effects and may require WQM for correctness. 
+; +; CHECK-LABEL: {{^}}test_alloca: +; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec + +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 +; CHECK: s_wqm_b64 exec, exec +; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen +; CHECK: s_wqm_b64 exec, exec +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen + +; CHECK: image_sample +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dwordx4 +define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { +entry: + %array = alloca [32 x i32], align 4 + + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + + %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0 + store volatile i32 %a, i32* %s.gep, align 4 + + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) + + %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx + %c = load i32, i32* %c.gep, align 4 + + %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + + ret void +} + + declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 
declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll index 151cc1b..04eae8f 100644 --- a/test/CodeGen/ARM/arm-and-tst-peephole.ll +++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll @@ -49,7 +49,7 @@ tailrecurse.switch: ; preds = %tailrecurse ; V8-NEXT: beq ; V8-NEXT: %tailrecurse.switch ; V8: cmp -; V8-NEXT: beq +; V8-NEXT: bne ; V8-NEXT: b ; The trailing space in the last line checks that the branch is unconditional switch i32 %and, label %sw.epilog [ diff --git a/test/CodeGen/ARM/ssat-v4t.ll b/test/CodeGen/ARM/ssat-v4t.ll new file mode 100644 index 0000000..3d74c88 --- /dev/null +++ b/test/CodeGen/ARM/ssat-v4t.ll @@ -0,0 +1,9 @@ +; RUN: not llc -O1 -mtriple=armv4t-none-none-eabi %s -o - 2>&1 | FileCheck %s + +; CHECK: Cannot select: intrinsic %llvm.arm.ssat +define i32 @ssat() nounwind { + %tmp = call i32 @llvm.arm.ssat(i32 128, i32 1) + ret i32 %tmp +} + +declare i32 @llvm.arm.ssat(i32, i32) nounwind readnone diff --git a/test/CodeGen/ARM/ssat.ll b/test/CodeGen/ARM/ssat.ll index 2b75bc4..f1e11dd 100644 --- a/test/CodeGen/ARM/ssat.ll +++ b/test/CodeGen/ARM/ssat.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s +; RUN: llc -mtriple=armv4t-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V4T +; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6T2 ; Check for several conditions that should result in SSAT. 
; For example, the base test is equivalent to @@ -16,7 +17,8 @@ ; 32-bit base test define i32 @sat_base_32bit(i32 %x) #0 { ; CHECK-LABEL: sat_base_32bit: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp slt i32 %x, -8388608 %cmpUp = icmp sgt i32 %x, 8388607 @@ -29,7 +31,8 @@ entry: ; 16-bit base test define i16 @sat_base_16bit(i16 %x) #0 { ; CHECK-LABEL: sat_base_16bit: -; CHECK: ssat r0, #12, r0 +; V6T2: ssat r0, #12, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp slt i16 %x, -2048 %cmpUp = icmp sgt i16 %x, 2047 @@ -42,7 +45,8 @@ entry: ; 8-bit base test define i8 @sat_base_8bit(i8 %x) #0 { ; CHECK-LABEL: sat_base_8bit: -; CHECK: ssat r0, #6, r0 +; V6T2: ssat r0, #6, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp slt i8 %x, -32 %cmpUp = icmp sgt i8 %x, 31 @@ -60,7 +64,8 @@ entry: ; x < -k ? -k : (x < k ? x : k) define i32 @sat_lower_upper_1(i32 %x) #0 { ; CHECK-LABEL: sat_lower_upper_1: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp slt i32 %x, -8388608 %cmpUp = icmp slt i32 %x, 8388607 @@ -72,7 +77,8 @@ entry: ; x > -k ? (x > k ? k : x) : -k define i32 @sat_lower_upper_2(i32 %x) #0 { ; CHECK-LABEL: sat_lower_upper_2: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp sgt i32 %x, -8388608 %cmpUp = icmp sgt i32 %x, 8388607 @@ -84,7 +90,8 @@ entry: ; x < k ? (x < -k ? -k : x) : k define i32 @sat_upper_lower_1(i32 %x) #0 { ; CHECK-LABEL: sat_upper_lower_1: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpUp = icmp slt i32 %x, 8388607 %cmpLow = icmp slt i32 %x, -8388608 @@ -96,7 +103,8 @@ entry: ; x > k ? k : (x < -k ? -k : x) define i32 @sat_upper_lower_2(i32 %x) #0 { ; CHECK-LABEL: sat_upper_lower_2: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpUp = icmp sgt i32 %x, 8388607 %cmpLow = icmp slt i32 %x, -8388608 @@ -108,7 +116,8 @@ entry: ; k < x ? k : (x > -k ? 
x : -k) define i32 @sat_upper_lower_3(i32 %x) #0 { ; CHECK-LABEL: sat_upper_lower_3: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpUp = icmp slt i32 8388607, %x %cmpLow = icmp sgt i32 %x, -8388608 @@ -125,7 +134,8 @@ entry: ; k <= x ? k : (x >= -k ? x : -k) define i32 @sat_le_ge(i32 %x) #0 { ; CHECK-LABEL: sat_le_ge: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpUp = icmp sle i32 8388607, %x %cmpLow = icmp sge i32 %x, -8388608 diff --git a/test/CodeGen/ARM/usat-v4t.ll b/test/CodeGen/ARM/usat-v4t.ll new file mode 100644 index 0000000..572c760 --- /dev/null +++ b/test/CodeGen/ARM/usat-v4t.ll @@ -0,0 +1,9 @@ +; RUN: not llc -O1 -mtriple=armv4t-none-none-eabi %s -o - 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.arm.usat +define i32 @usat1() nounwind { + %tmp = call i32 @llvm.arm.usat(i32 128, i32 31) + ret i32 %tmp +} + +declare i32 @llvm.arm.usat(i32, i32) nounwind readnone diff --git a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll index f736ddd..c0229c6 100644 --- a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll +++ b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll @@ -11,13 +11,13 @@ entry: ; PIC-O32: lwc1 $f0, %lo($CPI0_0)($[[R0]]) ; STATIC-O32: lui $[[R0:[0-9]+]], %hi($CPI0_0) ; STATIC-O32: lwc1 $f0, %lo($CPI0_0)($[[R0]]) -; PIC-N32: lw $[[R0:[0-9]+]], %got_page($CPI0_0) -; PIC-N32: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]]) -; STATIC-N32: lui $[[R0:[0-9]+]], %hi($CPI0_0) -; STATIC-N32: lwc1 $f0, %lo($CPI0_0)($[[R0]]) -; PIC-N64: ld $[[R0:[0-9]+]], %got_page($CPI0_0) -; PIC-N64: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]]) -; STATIC-N64: ld $[[R0:[0-9]+]], %got_page($CPI0_0) -; STATIC-N64: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]]) +; PIC-N32: lw $[[R0:[0-9]+]], %got_page(.LCPI0_0) +; PIC-N32: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]]) +; STATIC-N32: lui $[[R0:[0-9]+]], %hi(.LCPI0_0) +; STATIC-N32: lwc1 $f0, %lo(.LCPI0_0)($[[R0]]) +; PIC-N64: ld 
$[[R0:[0-9]+]], %got_page(.LCPI0_0) +; PIC-N64: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]]) +; STATIC-N64: ld $[[R0:[0-9]+]], %got_page(.LCPI0_0) +; STATIC-N64: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]]) ret float 0x400B333340000000 } diff --git a/test/CodeGen/Mips/2010-07-20-Switch.ll b/test/CodeGen/Mips/2010-07-20-Switch.ll index 7d66d1a..5f0a0a5 100644 --- a/test/CodeGen/Mips/2010-07-20-Switch.ll +++ b/test/CodeGen/Mips/2010-07-20-Switch.ll @@ -27,9 +27,9 @@ entry: ; PIC-O32: addu $[[R5:[0-9]+]], $[[R4:[0-9]+]] ; PIC-O32: jr $[[R5]] ; N64: dsll $[[R0:[0-9]+]], ${{[0-9]+}}, 3 -; N64: ld $[[R1:[0-9]+]], %got_page($JTI0_0) +; N64: ld $[[R1:[0-9]+]], %got_page(.LJTI0_0) ; N64: daddu $[[R2:[0-9]+]], $[[R0:[0-9]+]], $[[R1]] -; N64: ld $[[R4:[0-9]+]], %got_ofst($JTI0_0)($[[R2]]) +; N64: ld $[[R4:[0-9]+]], %got_ofst(.LJTI0_0)($[[R2]]) ; N64: daddu $[[R5:[0-9]+]], $[[R4:[0-9]+]] ; N64: jr $[[R5]] switch i32 %0, label %bb4 [ @@ -68,7 +68,7 @@ bb5: ; preds = %entry ; PIC-O32: .gpword ; PIC-O32: .gpword ; N64: .p2align 3 -; N64: $JTI0_0: +; N64: .LJTI0_0: ; N64: .gpdword ; N64: .gpdword ; N64: .gpdword diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll index 377fe93..6215087 100644 --- a/test/CodeGen/Mips/analyzebranch.ll +++ b/test/CodeGen/Mips/analyzebranch.ll @@ -10,7 +10,7 @@ define double @foo(double %a, double %b) nounwind readnone { entry: ; ALL-LABEL: foo: -; FCC: bc1f $BB +; FCC: bc1f {{\$|\.L}}BB ; FCC: nop ; 32-GPR: mtc1 $zero, $[[Z:f[0-9]]] @@ -19,7 +19,7 @@ entry: ; GPR: cmp.lt.d $[[FGRCC:f[0-9]+]], $[[Z]], $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] -; GPR: bnezc $[[GPRCC]], $BB +; GPR: bnezc $[[GPRCC]], {{\$|\.L}}BB %cmp = fcmp ogt double %a, 0.000000e+00 br i1 %cmp, label %if.end6, label %if.else @@ -43,7 +43,7 @@ define void @f1(float %f) nounwind { entry: ; ALL-LABEL: f1: -; FCC: bc1f $BB +; FCC: bc1f {{\$|\.L}}BB ; FCC: nop ; GPR: mtc1 $zero, $[[Z:f[0-9]]] diff --git 
a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll index 8f4ccb1..dfba8ba 100644 --- a/test/CodeGen/Mips/atomic.ll +++ b/test/CodeGen/Mips/atomic.ll @@ -34,17 +34,17 @@ entry: ; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) ; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)( -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R1:[0-9]+]] ; O0-NEXT: ll $[[R2:[0-9]+]], 0($[[R1]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R3:[0-9]+]], 0($[[R0]]) ; ALL: addu $[[R4:[0-9]+]], $[[R3]], $4 ; ALL: sc $[[R4]], 0($[[R0]]) -; NOT-MICROMIPS: beqz $[[R4]], $[[BB0]] -; MICROMIPS: beqzc $[[R4]], $[[BB0]] -; MIPSR6: beqzc $[[R4]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R4]], [[BB0]] +; MICROMIPS: beqzc $[[R4]], [[BB0]] +; MIPSR6: beqzc $[[R4]], [[BB0]] } define i32 @AtomicLoadNand32(i32 signext %incr) nounwind { @@ -59,14 +59,14 @@ entry: -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R1:[0-9]+]], 0($[[R0]]) ; ALL: and $[[R3:[0-9]+]], $[[R1]], $4 ; ALL: nor $[[R2:[0-9]+]], $zero, $[[R3]] ; ALL: sc $[[R2]], 0($[[R0]]) -; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]] -; MICROMIPS: beqzc $[[R2]], $[[BB0]] -; MIPSR6: beqzc $[[R2]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R2]], [[BB0]] +; MICROMIPS: beqzc $[[R2]], [[BB0]] +; MIPSR6: beqzc $[[R2]], [[BB0]] } define i32 @AtomicSwap32(i32 signext %newval) nounwind { @@ -82,12 +82,12 @@ entry: ; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) ; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll ${{[0-9]+}}, 0($[[R0]]) ; ALL: sc $[[R2:[0-9]+]], 0($[[R0]]) -; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]] -; MICROMIPS: beqzc $[[R2]], $[[BB0]] -; MIPSR6: beqzc $[[R2]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R2]], [[BB0]] +; MICROMIPS: beqzc $[[R2]], [[BB0]] +; MIPSR6: beqzc $[[R2]], [[BB0]] } define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind { @@ -104,16 +104,16 @@ entry: ; MIPS32-ANY: lw 
$[[R0:[0-9]+]], %got(x) ; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)( -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $2, 0($[[R0]]) -; NOT-MICROMIPS: bne $2, $4, $[[BB1:[A-Z_0-9]+]] -; MICROMIPS: bne $2, $4, $[[BB1:[A-Z_0-9]+]] -; MIPSR6: bnec $2, $4, $[[BB1:[A-Z_0-9]+]] +; NOT-MICROMIPS: bne $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MICROMIPS: bne $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MIPSR6: bnec $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]] ; ALL: sc $[[R2:[0-9]+]], 0($[[R0]]) -; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]] -; MICROMIPS: beqzc $[[R2]], $[[BB0]] -; MIPSR6: beqzc $[[R2]], $[[BB0]] -; ALL: $[[BB1]]: +; NOT-MICROMIPS: beqz $[[R2]], [[BB0]] +; MICROMIPS: beqzc $[[R2]], [[BB0]] +; MIPSR6: beqzc $[[R2]], [[BB0]] +; ALL: [[BB1]]: } @@ -141,20 +141,20 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R10:[0-9]+]] ; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R12:[0-9]+]], 0($[[R2]]) ; ALL: addu $[[R13:[0-9]+]], $[[R12]], $[[R9]] ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] ; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]] ; ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] ; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]] ; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]] @@ -186,20 +186,20 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R10:[0-9]+]] ; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R12:[0-9]+]], 0($[[R2]]) ; ALL: 
subu $[[R13:[0-9]+]], $[[R12]], $[[R9]] ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] ; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]] ; ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] ; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]] ; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]] @@ -231,11 +231,11 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R10:[0-9]+]] ; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R12:[0-9]+]], 0($[[R2]]) ; ALL: and $[[R13:[0-9]+]], $[[R12]], $[[R9]] ; ALL: nor $[[R14:[0-9]+]], $zero, $[[R13]] @@ -243,9 +243,9 @@ entry: ; ALL: and $[[R16:[0-9]+]], $[[R12]], $[[R8]] ; ALL: or $[[R17:[0-9]+]], $[[R16]], $[[R15]] ; ALL: sc $[[R17]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R17]], $[[BB0]] -; MICROMIPS: beqzc $[[R17]], $[[BB0]] -; MIPSR6: beqzc $[[R17]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R17]], [[BB0]] +; MICROMIPS: beqzc $[[R17]], [[BB0]] +; MIPSR6: beqzc $[[R17]], [[BB0]] ; ALL: and $[[R18:[0-9]+]], $[[R12]], $[[R7]] ; ALL: srlv $[[R19:[0-9]+]], $[[R18]], $[[R5]] @@ -277,15 +277,15 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R10:[0-9]+]], 0($[[R2]]) ; ALL: and $[[R18:[0-9]+]], $[[R9]], $[[R7]] ; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] ; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R18]] ; ALL: sc $[[R14]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R14]], $[[BB0]] -; MICROMIPS: beqzc $[[R14]], $[[BB0]] -; MIPSR6: beqzc $[[R14]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R14]], [[BB0]] +; MICROMIPS: 
beqzc $[[R14]], [[BB0]] +; MIPSR6: beqzc $[[R14]], [[BB0]] ; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] ; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] @@ -322,21 +322,21 @@ entry: ; ALL: andi $[[R11:[0-9]+]], $5, 255 ; ALL: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]] -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R13:[0-9]+]], 0($[[R2]]) ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] -; NOT-MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] -; MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] -; MIPSR6: bnec $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] +; NOT-MICROMIPS: bne $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MICROMIPS: bne $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MIPSR6: bnec $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] ; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]] ; ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] -; ALL: $[[BB1]]: +; ALL: [[BB1]]: ; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]] ; NO-SEB-SEH: sll $[[R18:[0-9]+]], $[[R17]], 24 @@ -366,21 +366,21 @@ entry: ; ALL: andi $[[R11:[0-9]+]], $6, 255 ; ALL: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]] -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R13:[0-9]+]], 0($[[R2]]) ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] -; NOT-MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] -; MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] -; MIPSR6: bnec $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] +; NOT-MICROMIPS: bne $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MICROMIPS: bne $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MIPSR6: bnec $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] ; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]] ; 
ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] -; ALL: $[[BB1]]: +; ALL: [[BB1]]: ; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]] ; NO-SEB-SEH: sll $[[R18:[0-9]+]], $[[R17]], 24 @@ -423,20 +423,20 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R10:[0-9]+]] ; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R12:[0-9]+]], 0($[[R2]]) ; ALL: addu $[[R13:[0-9]+]], $[[R12]], $[[R9]] ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] ; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]] ; ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] ; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]] ; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]] @@ -465,15 +465,15 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) { ; ALL: sync ; ALL: andi $[[R3:[0-9]+]], $[[R2]], 65535 -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R4:[0-9]+]], 0($[[R5:[0-9]+]]) ; ALL: and $[[R6:[0-9]+]], $[[R4]], $ ; ALL: and $[[R7:[0-9]+]], $[[R4]], $ ; ALL: or $[[R8:[0-9]+]], $[[R7]], $ ; ALL: sc $[[R8]], 0($[[R5]]) -; NOT-MICROMIPS: beqz $[[R8]], $[[BB0]] -; MICROMIPS: beqzc $[[R8]], $[[BB0]] -; MIPSR6: beqzc $[[R8]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R8]], [[BB0]] +; MICROMIPS: beqzc $[[R8]], [[BB0]] +; MIPSR6: beqzc $[[R8]], [[BB0]] ; ALL: srlv $[[R9:[0-9]+]], $[[R6]], $ @@ -538,11 +538,11 @@ entry: ; MIPS64-ANY: ld $[[R0:[0-9]+]], 
%got_disp(x)( ; ALL: addiu $[[PTR:[0-9]+]], $[[R0]], 1024 -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R1:[0-9]+]], 0($[[PTR]]) ; ALL: addu $[[R2:[0-9]+]], $[[R1]], $4 ; ALL: sc $[[R2]], 0($[[PTR]]) -; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]] -; MICROMIPS: beqzc $[[R2]], $[[BB0]] -; MIPSR6: beqzc $[[R2]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R2]], [[BB0]] +; MICROMIPS: beqzc $[[R2]], [[BB0]] +; MIPSR6: beqzc $[[R2]], [[BB0]] } diff --git a/test/CodeGen/Mips/blez_bgez.ll b/test/CodeGen/Mips/blez_bgez.ll index dcda047..84c8af4 100644 --- a/test/CodeGen/Mips/blez_bgez.ll +++ b/test/CodeGen/Mips/blez_bgez.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=mips64el < %s | FileCheck %s ; CHECK-LABEL: test_blez: -; CHECK: blez ${{[0-9]+}}, $BB +; CHECK: blez ${{[0-9]+}}, {{\$|\.L}}BB define void @test_blez(i32 %a) { entry: @@ -20,7 +20,7 @@ if.end: declare void @foo1() ; CHECK-LABEL: test_bgez: -; CHECK: bgez ${{[0-9]+}}, $BB +; CHECK: bgez ${{[0-9]+}}, {{\$|\.L}}BB define void @test_bgez(i32 %a) { entry: diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll index f743637..9bc9a30 100644 --- a/test/CodeGen/Mips/blockaddr.ll +++ b/test/CodeGen/Mips/blockaddr.ll @@ -22,22 +22,22 @@ entry: ; STATIC-O32: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp[[T2]]) ; STATIC-O32: lui $[[R3:[0-9]+]], %hi($tmp[[T3:[0-9]+]]) ; STATIC-O32: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp[[T3]]) -; PIC-N32: lw $[[R0:[0-9]+]], %got_page($tmp[[T0:[0-9]+]]) -; PIC-N32: addiu ${{[0-9]+}}, $[[R0]], %got_ofst($tmp[[T0]]) -; PIC-N32: lw $[[R1:[0-9]+]], %got_page($tmp[[T1:[0-9]+]]) -; PIC-N32: addiu ${{[0-9]+}}, $[[R1]], %got_ofst($tmp[[T1]]) -; STATIC-N32: lui $[[R2:[0-9]+]], %hi($tmp[[T2:[0-9]+]]) -; STATIC-N32: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp[[T2]]) -; STATIC-N32: lui $[[R3:[0-9]+]], %hi($tmp[[T3:[0-9]+]]) -; STATIC-N32: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp[[T3]]) -; PIC-N64: ld $[[R0:[0-9]+]], %got_page($tmp[[T0:[0-9]+]]) -; PIC-N64: daddiu ${{[0-9]+}}, $[[R0]], 
%got_ofst($tmp[[T0]]) -; PIC-N64: ld $[[R1:[0-9]+]], %got_page($tmp[[T1:[0-9]+]]) -; PIC-N64: daddiu ${{[0-9]+}}, $[[R1]], %got_ofst($tmp[[T1]]) -; STATIC-N64: ld $[[R2:[0-9]+]], %got_page($tmp[[T2:[0-9]+]]) -; STATIC-N64: daddiu ${{[0-9]+}}, $[[R2]], %got_ofst($tmp[[T2]]) -; STATIC-N64: ld $[[R3:[0-9]+]], %got_page($tmp[[T3:[0-9]+]]) -; STATIC-N64: daddiu ${{[0-9]+}}, $[[R3]], %got_ofst($tmp[[T3]]) +; PIC-N32: lw $[[R0:[0-9]+]], %got_page(.Ltmp[[T0:[0-9]+]]) +; PIC-N32: addiu ${{[0-9]+}}, $[[R0]], %got_ofst(.Ltmp[[T0]]) +; PIC-N32: lw $[[R1:[0-9]+]], %got_page(.Ltmp[[T1:[0-9]+]]) +; PIC-N32: addiu ${{[0-9]+}}, $[[R1]], %got_ofst(.Ltmp[[T1]]) +; STATIC-N32: lui $[[R2:[0-9]+]], %hi(.Ltmp[[T2:[0-9]+]]) +; STATIC-N32: addiu ${{[0-9]+}}, $[[R2]], %lo(.Ltmp[[T2]]) +; STATIC-N32: lui $[[R3:[0-9]+]], %hi(.Ltmp[[T3:[0-9]+]]) +; STATIC-N32: addiu ${{[0-9]+}}, $[[R3]], %lo(.Ltmp[[T3]]) +; PIC-N64: ld $[[R0:[0-9]+]], %got_page(.Ltmp[[T0:[0-9]+]]) +; PIC-N64: daddiu ${{[0-9]+}}, $[[R0]], %got_ofst(.Ltmp[[T0]]) +; PIC-N64: ld $[[R1:[0-9]+]], %got_page(.Ltmp[[T1:[0-9]+]]) +; PIC-N64: daddiu ${{[0-9]+}}, $[[R1]], %got_ofst(.Ltmp[[T1]]) +; STATIC-N64: ld $[[R2:[0-9]+]], %got_page(.Ltmp[[T2:[0-9]+]]) +; STATIC-N64: daddiu ${{[0-9]+}}, $[[R2]], %got_ofst(.Ltmp[[T2]]) +; STATIC-N64: ld $[[R3:[0-9]+]], %got_page(.Ltmp[[T3:[0-9]+]]) +; STATIC-N64: daddiu ${{[0-9]+}}, $[[R3]], %got_ofst(.Ltmp[[T3]]) ; STATIC-MIPS16-1: .ent f ; STATIC-MIPS16-2: .ent f ; STATIC-MIPS16-1: li $[[R1_16:[0-9]+]], %hi($tmp[[TI_16:[0-9]+]]) diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll index d6d4767..9352294 100644 --- a/test/CodeGen/Mips/ehframe-indirect.ll +++ b/test/CodeGen/Mips/ehframe-indirect.ll @@ -33,9 +33,15 @@ declare void @foo() ; ALL: GCC_except_table{{[0-9]+}}: ; ALL: .byte 155 # @TType Encoding = indirect pcrel sdata4 -; ALL: $[[PC_LABEL:tmp[0-9]+]]: -; ALL: .4byte ($_ZTISt9exception.DW.stub)-($[[PC_LABEL]]) -; ALL: $_ZTISt9exception.DW.stub: +; O32: 
[[PC_LABEL:\$tmp[0-9]+]]: +; N32: [[PC_LABEL:\.Ltmp[0-9]+]]: +; N64: [[PC_LABEL:\.Ltmp[0-9]+]]: +; O32: .4byte ($_ZTISt9exception.DW.stub)-([[PC_LABEL]]) +; N32: .4byte .L_ZTISt9exception.DW.stub-[[PC_LABEL]] +; N64: .4byte .L_ZTISt9exception.DW.stub-[[PC_LABEL]] +; O32: $_ZTISt9exception.DW.stub: +; N32: .L_ZTISt9exception.DW.stub: +; N64: .L_ZTISt9exception.DW.stub: ; O32: .4byte _ZTISt9exception ; N32: .4byte _ZTISt9exception ; N64: .8byte _ZTISt9exception diff --git a/test/CodeGen/Mips/fcmp.ll b/test/CodeGen/Mips/fcmp.ll index 142ee11..bd04ed0 100644 --- a/test/CodeGen/Mips/fcmp.ll +++ b/test/CodeGen/Mips/fcmp.ll @@ -1076,12 +1076,12 @@ entry: ; 32-CMP-DAG: bnezc $[[T4]], ; 64-C-DAG: add.s $[[T0:f[0-9]+]], $f13, $f12 -; 64-C-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst($CPI32_0)( +; 64-C-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst(.LCPI32_0)( ; 64-C-DAG: c.ole.s $[[T0]], $[[T1]] ; 64-C-DAG: bc1t ; 64-CMP-DAG: add.s $[[T0:f[0-9]+]], $f13, $f12 -; 64-CMP-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst($CPI32_0)( +; 64-CMP-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst(.LCPI32_0)( ; 64-CMP-DAG: cmp.le.s $[[T2:f[0-9]+]], $[[T0]], $[[T1]] ; 64-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]] ; FIXME: This instruction is redundant. 
@@ -1106,8 +1106,8 @@ entry: ; MM64R6-DAG: daddu $[[T1:[0-9]+]], $[[T0]], $25 ; MM64R6-DAG: daddiu $[[T2:[0-9]+]], $[[T1]], %lo(%neg(%gp_rel(bug1_f32))) ; MM64R6-DAG: add.s $[[T3:f[0-9]+]], $f13, $f12 -; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page($CPI32_0)($[[T2]]) -; MM64R6-DAG: lwc1 $[[T5:f[0-9]+]], %got_ofst($CPI32_0)($[[T4]]) +; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page(.LCPI32_0)($[[T2]]) +; MM64R6-DAG: lwc1 $[[T5:f[0-9]+]], %got_ofst(.LCPI32_0)($[[T4]]) ; MM64R6-DAG: cmp.le.s $[[T6:f[0-9]+]], $[[T3]], $[[T5]] ; MM64R6-DAG: mfc1 $[[T7:[0-9]+]], $[[T6]] ; MM64R6-DAG: andi16 $[[T8:[0-9]+]], $[[T7]], 1 @@ -1145,12 +1145,12 @@ entry: ; 32-CMP-DAG: bnezc $[[T4]], ; 64-C-DAG: add.d $[[T0:f[0-9]+]], $f13, $f12 -; 64-C-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst($CPI33_0)( +; 64-C-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst(.LCPI33_0)( ; 64-C-DAG: c.ole.d $[[T0]], $[[T1]] ; 64-C-DAG: bc1t ; 64-CMP-DAG: add.d $[[T0:f[0-9]+]], $f13, $f12 -; 64-CMP-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst($CPI33_0)( +; 64-CMP-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst(.LCPI33_0)( ; 64-CMP-DAG: cmp.le.d $[[T2:f[0-9]+]], $[[T0]], $[[T1]] ; 64-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]] ; FIXME: This instruction is redundant. 
@@ -1175,8 +1175,8 @@ entry: ; MM64R6-DAG: daddu $[[T1:[0-9]+]], $[[T0]], $25 ; MM64R6-DAG: daddiu $[[T2:[0-9]+]], $[[T1]], %lo(%neg(%gp_rel(bug1_f64))) ; MM64R6-DAG: add.d $[[T3:f[0-9]+]], $f13, $f12 -; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page($CPI33_0)($[[T2]]) -; MM64R6-DAG: ldc1 $[[T5:f[0-9]+]], %got_ofst($CPI33_0)($[[T4]]) +; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page(.LCPI33_0)($[[T2]]) +; MM64R6-DAG: ldc1 $[[T5:f[0-9]+]], %got_ofst(.LCPI33_0)($[[T4]]) ; MM64R6-DAG: cmp.le.d $[[T6:f[0-9]+]], $[[T3]], $[[T5]] ; MM64R6-DAG: mfc1 $[[T7:[0-9]+]], $[[T6]] ; MM64R6-DAG: andi16 $[[T8:[0-9]+]], $[[T7]], 1 diff --git a/test/CodeGen/Mips/fpbr.ll b/test/CodeGen/Mips/fpbr.ll index bf1b045..7fb508f 100644 --- a/test/CodeGen/Mips/fpbr.ll +++ b/test/CodeGen/Mips/fpbr.ll @@ -10,8 +10,9 @@ entry: ; ALL-LABEL: func0: ; 32-FCC: c.eq.s $f12, $f14 +; 32-FCC: bc1f $BB0_2 ; 64-FCC: c.eq.s $f12, $f13 -; FCC: bc1f $BB0_2 +; 64-FCC: bc1f .LBB0_2 ; 32-GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f14 ; 64-GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f13 @@ -19,7 +20,7 @@ entry: ; FIXME: We ought to be able to transform not+bnez -> beqz ; GPR: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: bnez $[[GPRCC]], $BB0_2 -; 64-GPR: bnezc $[[GPRCC]], $BB0_2 +; 64-GPR: bnezc $[[GPRCC]], .LBB0_2 %cmp = fcmp oeq float %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -45,15 +46,16 @@ entry: ; ALL-LABEL: func1: ; 32-FCC: c.olt.s $f12, $f14 +; 32-FCC: bc1f $BB1_2 ; 64-FCC: c.olt.s $f12, $f13 -; FCC: bc1f $BB1_2 +; 64-FCC: bc1f .LBB1_2 ; 32-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f14, $f12 ; 64-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f13, $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: bnez $[[GPRCC]], $BB1_2 -; 64-GPR: bnezc $[[GPRCC]], $BB1_2 +; 64-GPR: bnezc $[[GPRCC]], .LBB1_2 %cmp = fcmp olt float %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -75,15 +77,16 @@ entry: ; ALL-LABEL: func2: ; 32-FCC: c.ole.s $f12, $f14 +; 32-FCC: bc1t $BB2_2 ; 64-FCC: c.ole.s 
$f12, $f13 -; FCC: bc1t $BB2_2 +; 64-FCC: bc1t .LBB2_2 ; 32-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f14, $f12 ; 64-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f13, $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: beqz $[[GPRCC]], $BB2_2 -; 64-GPR: beqzc $[[GPRCC]], $BB2_2 +; 64-GPR: beqzc $[[GPRCC]], .LBB2_2 %cmp = fcmp ugt float %f2, %f3 br i1 %cmp, label %if.else, label %if.then @@ -105,8 +108,9 @@ entry: ; ALL-LABEL: func3: ; 32-FCC: c.eq.d $f12, $f14 +; 32-FCC: bc1f $BB3_2 ; 64-FCC: c.eq.d $f12, $f13 -; FCC: bc1f $BB3_2 +; 64-FCC: bc1f .LBB3_2 ; 32-GPR: cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f14 ; 64-GPR: cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f13 @@ -114,7 +118,7 @@ entry: ; FIXME: We ought to be able to transform not+bnez -> beqz ; GPR: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: bnez $[[GPRCC]], $BB3_2 -; 64-GPR: bnezc $[[GPRCC]], $BB3_2 +; 64-GPR: bnezc $[[GPRCC]], .LBB3_2 %cmp = fcmp oeq double %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -136,15 +140,16 @@ entry: ; ALL-LABEL: func4: ; 32-FCC: c.olt.d $f12, $f14 +; 32-FCC: bc1f $BB4_2 ; 64-FCC: c.olt.d $f12, $f13 -; FCC: bc1f $BB4_2 +; 64-FCC: bc1f .LBB4_2 ; 32-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f14, $f12 ; 64-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f13, $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: bnez $[[GPRCC]], $BB4_2 -; 64-GPR: bnezc $[[GPRCC]], $BB4_2 +; 64-GPR: bnezc $[[GPRCC]], .LBB4_2 %cmp = fcmp olt double %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -166,15 +171,16 @@ entry: ; ALL-LABEL: func5: ; 32-FCC: c.ole.d $f12, $f14 +; 32-FCC: bc1t $BB5_2 ; 64-FCC: c.ole.d $f12, $f13 -; FCC: bc1t $BB5_2 +; 64-FCC: bc1t .LBB5_2 ; 32-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f14, $f12 ; 64-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f13, $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: beqz $[[GPRCC]], $BB5_2 -; 64-GPR: beqzc $[[GPRCC]], $BB5_2 +; 64-GPR: beqzc 
$[[GPRCC]], .LBB5_2 %cmp = fcmp ugt double %f2, %f3 br i1 %cmp, label %if.else, label %if.then diff --git a/test/CodeGen/Mips/jumptable_labels.ll b/test/CodeGen/Mips/jumptable_labels.ll new file mode 100644 index 0000000..8c7edc1 --- /dev/null +++ b/test/CodeGen/Mips/jumptable_labels.ll @@ -0,0 +1,75 @@ +; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=O32 +; RUN: llc -march=mips64 -target-abi=n32 < %s | FileCheck %s -check-prefix=N32 +; RUN: llc -march=mips64 < %s | FileCheck %s -check-prefix=N64 + +; We only use the '$' prefix on O32. The others use the ELF convention. +; O32: $JTI0_0 +; N32: .LJTI0_0 +; N64: .LJTI0_0 + +; Check basic block labels while we're at it. +; O32: $BB0_2: +; N32: .LBB0_2: +; N64: .LBB0_2: + +@.str = private unnamed_addr constant [2 x i8] c"A\00", align 1 +@.str.1 = private unnamed_addr constant [2 x i8] c"B\00", align 1 +@.str.2 = private unnamed_addr constant [2 x i8] c"C\00", align 1 +@.str.3 = private unnamed_addr constant [2 x i8] c"D\00", align 1 +@.str.4 = private unnamed_addr constant [2 x i8] c"E\00", align 1 +@.str.5 = private unnamed_addr constant [2 x i8] c"F\00", align 1 +@.str.6 = private unnamed_addr constant [2 x i8] c"G\00", align 1 +@.str.7 = private unnamed_addr constant [1 x i8] zeroinitializer, align 1 + +define i8* @_Z3fooi(i32 signext %Letter) { +entry: + %retval = alloca i8*, align 8 + %Letter.addr = alloca i32, align 4 + store i32 %Letter, i32* %Letter.addr, align 4 + %0 = load i32, i32* %Letter.addr, align 4 + switch i32 %0, label %sw.epilog [ + i32 0, label %sw.bb + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + i32 4, label %sw.bb4 + i32 5, label %sw.bb5 + i32 6, label %sw.bb6 + ] + +sw.bb: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb1: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb2: + store i8* getelementptr 
inbounds ([2 x i8], [2 x i8]* @.str.2, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb3: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.3, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb4: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.4, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb5: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.5, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb6: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.6, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.epilog: + store i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str.7, i32 0, i32 0), i8** %retval, align 8 + br label %return + +return: + %1 = load i8*, i8** %retval, align 8 + ret i8* %1 +} diff --git a/test/CodeGen/Mips/llvm-ir/ashr.ll b/test/CodeGen/Mips/llvm-ir/ashr.ll index af9b81f..cfb9855 100644 --- a/test/CodeGen/Mips/llvm-ir/ashr.ll +++ b/test/CodeGen/Mips/llvm-ir/ashr.ll @@ -167,18 +167,18 @@ entry: ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrav $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]] + ; M3: bnez $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]] ; M3: move $3, $[[T1]] ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: $[[BB0]]: - ; M3: beqz $[[T3]], $[[BB1:BB[0-9_]+]] + ; M3: [[BB0]]: + ; M3: beqz $[[T3]], [[BB1:.LBB[0-9_]+]] ; M3: nop ; M3: dsra $2, $4, 63 - ; M3: $[[BB1]]: + ; M3: [[BB1]]: ; M3: jr $ra ; M3: nop diff --git a/test/CodeGen/Mips/llvm-ir/indirectbr.ll b/test/CodeGen/Mips/llvm-ir/indirectbr.ll index d982b57..8fed32a 100644 --- a/test/CodeGen/Mips/llvm-ir/indirectbr.ll +++ b/test/CodeGen/Mips/llvm-ir/indirectbr.ll @@ -18,13 +18,13 @@ define i32 @br(i8 *%addr) { ; R6C: jrc $4 # , <16 x i8>*@v16i8 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, 
%lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <16 x i8> , <16 x i8>*@v16i8 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <16 x i8> , <16 x i8>*@v16i8 @@ -59,8 +59,8 @@ define void @const_v16i8() nounwind { store volatile <16 x i8> , <16 x i8>*@v16i8 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void @@ -77,8 +77,8 @@ define void @const_v8i16() nounwind { store volatile <8 x i16> , <8 x i16>*@v8i16 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.h [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <8 x i16> , <8 x i16>*@v8i16 @@ -93,8 +93,8 @@ define void @const_v8i16() nounwind { store volatile <8 x i16> , <8 x i16>*@v8i16 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.h [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void @@ -111,8 +111,8 @@ 
define void @const_v4i32() nounwind { store volatile <4 x i32> , <4 x i32>*@v4i32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <4 x i32> , <4 x i32>*@v4i32 @@ -123,14 +123,14 @@ define void @const_v4i32() nounwind { store volatile <4 x i32> , <4 x i32>*@v4i32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <4 x i32> , <4 x i32>*@v4i32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void @@ -156,15 +156,15 @@ define void @const_v2i64() nounwind { store volatile <2 x i64> , <2 x i64>*@v2i64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; MIPS32: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) ; MIPS64: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x i64> , <2 x i64>*@v2i64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, 
%got_ofst(.L ; MIPS32: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) ; MIPS64: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll index d714b3e..1546878 100644 --- a/test/CodeGen/Mips/msa/basic_operations_float.ll +++ b/test/CodeGen/Mips/msa/basic_operations_float.ll @@ -23,8 +23,8 @@ define void @const_v4f32() nounwind { store volatile <4 x float> , <4 x float>*@v4f32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <4 x float> , <4 x float>*@v4f32 @@ -34,14 +34,14 @@ define void @const_v4f32() nounwind { store volatile <4 x float> , <4 x float>*@v4f32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <4 x float> , <4 x float>*@v4f32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void @@ -55,38 +55,38 @@ define void @const_v2f64() nounwind { store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: 
ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void diff --git a/test/CodeGen/Mips/octeon.ll b/test/CodeGen/Mips/octeon.ll index b441274..7e2a810 100644 --- a/test/CodeGen/Mips/octeon.ll 
+++ b/test/CodeGen/Mips/octeon.ll @@ -91,9 +91,9 @@ entry: define i64 @bbit0(i64 %a) nounwind { entry: ; ALL-LABEL: bbit0: -; OCTEON: bbit0 $4, 3, $[[BB0:BB[0-9_]+]] +; OCTEON: bbit0 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]] ; MIPS64: andi $[[T0:[0-9]+]], $4, 8 -; MIPS64: bnez $[[T0]], $[[BB0:BB[0-9_]+]] +; MIPS64: bnez $[[T0]], [[BB0:(\$|\.L)BB[0-9_]+]] %bit = and i64 %a, 8 %res = icmp eq i64 %bit, 0 br i1 %res, label %endif, label %if @@ -107,11 +107,11 @@ endif: define i64 @bbit032(i64 %a) nounwind { entry: ; ALL-LABEL: bbit032: -; OCTEON: bbit032 $4, 3, $[[BB0:BB[0-9_]+]] +; OCTEON: bbit032 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]] ; MIPS64: daddiu $[[T0:[0-9]+]], $zero, 1 ; MIPS64: dsll $[[T1:[0-9]+]], $[[T0]], 35 ; MIPS64: and $[[T2:[0-9]+]], $4, $[[T1]] -; MIPS64: bnez $[[T2]], $[[BB0:BB[0-9_]+]] +; MIPS64: bnez $[[T2]], [[BB0:(\$|\.L)BB[0-9_]+]] %bit = and i64 %a, 34359738368 %res = icmp eq i64 %bit, 0 br i1 %res, label %endif, label %if @@ -125,9 +125,9 @@ endif: define i64 @bbit1(i64 %a) nounwind { entry: ; ALL-LABEL: bbit1: -; OCTEON: bbit1 $4, 3, $[[BB0:BB[0-9_]+]] +; OCTEON: bbit1 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]] ; MIPS64: andi $[[T0:[0-9]+]], $4, 8 -; MIPS64: beqz $[[T0]], $[[BB0:BB[0-9_]+]] +; MIPS64: beqz $[[T0]], [[BB0:(\$|\.L)BB[0-9_]+]] %bit = and i64 %a, 8 %res = icmp ne i64 %bit, 0 br i1 %res, label %endif, label %if @@ -141,11 +141,11 @@ endif: define i64 @bbit132(i64 %a) nounwind { entry: ; ALL-LABEL: bbit132: -; OCTEON: bbit132 $4, 3, $[[BB0:BB[0-9_]+]] +; OCTEON: bbit132 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]] ; MIPS64: daddiu $[[T0:[0-9]+]], $zero, 1 ; MIPS64: dsll $[[T1:[0-9]+]], $[[T0]], 35 ; MIPS64: and $[[T2:[0-9]+]], $4, $[[T1]] -; MIPS64: beqz $[[T2]], $[[BB0:BB[0-9_]+]] +; MIPS64: beqz $[[T2]], [[BB0:(\$|\.L)BB[0-9_]+]] %bit = and i64 %a, 34359738368 %res = icmp ne i64 %bit, 0 br i1 %res, label %endif, label %if diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index c7cf857..f886e1f 100644 --- 
a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -681,10 +681,11 @@ define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind { ; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq - %cvt = fptosi <4 x double> %a0 to <4 x i32> + %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) %res = bitcast <4 x i32> %cvt to <2 x i64> ret <2 x i64> %res } +declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind { ; X32-LABEL: test_mm256_cvttps_epi32: @@ -696,10 +697,11 @@ define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: vcvttps2dq %ymm0, %ymm0 ; X64-NEXT: retq - %cvt = fptosi <8 x float> %a0 to <8 x i32> + %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) %res = bitcast <8 x i32> %cvt to <4 x i64> ret <4 x i64> %res } +declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; X32-LABEL: test_mm256_div_pd: diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index a7b4c6b..0630fd8 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -359,35 +359,12 @@ define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) { declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone -define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) { -; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retl - %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind 
readnone - - -define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-NEXT: retl - %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1] - ret <8 x i32> %res -} -declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone - - define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) { ; add operation forces the execution domain. ; CHECK-LABEL: test_x86_sse2_storeu_dq: ; CHECK: ## BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqu %xmm0, (%eax) ; CHECK-NEXT: retl %a2 = add <16 x i8> %a1, diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 3576329..c5d60da 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -3431,6 +3431,39 @@ define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) { declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone +define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) { +; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256: +; AVX: ## BB#0: +; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcvttpd2dqy %ymm0, %xmm0 +; AVX512VL-NEXT: retl + %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone + + +define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) { +; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256: +; AVX: ## BB#0: +; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcvttps2dq %ymm0, 
%ymm0 +; AVX512VL-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone + + define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) { ; AVX-LABEL: test_x86_avx_dp_ps_256: ; AVX: ## BB#0: @@ -4552,7 +4585,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind { ; AVX-LABEL: movnt_dq: ; AVX: ## BB#0: ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0 +; AVX-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0 ; AVX-NEXT: vmovntdq %ymm0, (%eax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retl @@ -4560,7 +4593,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind { ; AVX512VL-LABEL: movnt_dq: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ; AVX512VL-NEXT: retl %a2 = add <2 x i64> %a1, diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 914f859..d2410e4 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -744,6 +744,36 @@ define <8 x double> @sitofp_8i8_double(<8 x i8> %a) { ret <8 x double> %1 } +define <16 x double> @sitofp_16i1_double(<16 x double> %a) { +; KNL-LABEL: sitofp_16i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm1, %ymm1 +; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_16i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0 +; SKX-NEXT: 
vcmpltpd %zmm0, %zmm2, %k1 +; SKX-NEXT: vpmovm2d %k1, %ymm0 +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 +; SKX-NEXT: vpmovm2d %k0, %ymm1 +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 +; SKX-NEXT: retq + %cmpres = fcmp ogt <16 x double> %a, zeroinitializer + %1 = sitofp <16 x i1> %cmpres to <16 x double> + ret <16 x double> %1 +} + define <8 x double> @sitofp_8i1_double(<8 x double> %a) { ; KNL-LABEL: sitofp_8i1_double: ; KNL: ## BB#0: @@ -767,6 +797,130 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) { ret <8 x double> %1 } +define <8 x float> @sitofp_8i1_float(<8 x float> %a) { +; KNL-LABEL: sitofp_8i1_float: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_8i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %ymm0 +; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <8 x float> %a, zeroinitializer + %1 = sitofp <8 x i1> %cmpres to <8 x float> + ret <8 x float> %1 +} + +define <4 x float> @sitofp_4i1_float(<4 x float> %a) { +; KNL-LABEL: sitofp_4i1_float: +; KNL: ## BB#0: +; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_4i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <4 x float> %a, zeroinitializer + %1 = sitofp <4 x i1> %cmpres to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @sitofp_4i1_double(<4 x double> %a) { +; KNL-LABEL: sitofp_4i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vxorpd %ymm1, 
%ymm1, %ymm1 +; KNL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_4i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vcmpltpd %ymm0, %ymm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <4 x double> %a, zeroinitializer + %1 = sitofp <4 x i1> %cmpres to <4 x double> + ret <4 x double> %1 +} + +define <2 x float> @sitofp_2i1_float(<2 x float> %a) { +; KNL-LABEL: sitofp_2i1_float: +; KNL: ## BB#0: +; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; KNL-NEXT: vpsllq $32, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm1 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: xorl %ecx, %ecx +; KNL-NEXT: testb $1, %al +; KNL-NEXT: movl $-1, %eax +; KNL-NEXT: movl $0, %edx +; KNL-NEXT: cmovnel %eax, %edx +; KNL-NEXT: vcvtsi2ssl %edx, %xmm0, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rdx +; KNL-NEXT: testb $1, %dl +; KNL-NEXT: cmovnel %eax, %ecx +; KNL-NEXT: vcvtsi2ssl %ecx, %xmm0, %xmm0 +; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_2i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <2 x float> %a, zeroinitializer + %1 = sitofp <2 x i1> %cmpres to <2 x float> + ret <2 x float> %1 +} + +define <2 x double> @sitofp_2i1_double(<2 x double> %a) { +; KNL-LABEL: sitofp_2i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; 
KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_2i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; SKX-NEXT: vpmovm2q %k0, %xmm0 +; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <2 x double> %a, zeroinitializer + %1 = sitofp <2 x i1> %cmpres to <2 x double> + ret <2 x double> %1 +} + define <16 x float> @uitofp_16i8(<16 x i8>%a) { ; ALL-LABEL: uitofp_16i8: ; ALL: ## BB#0: @@ -787,3 +941,196 @@ define <16 x float> @uitofp_16i16(<16 x i16>%a) { ret <16 x float>%b } +define <16 x float> @uitofp_16i1_float(<16 x i32> %a) { +; ALL-LABEL: uitofp_16i1_float: +; ALL: ## BB#0: +; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; ALL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x double> @uitofp_16i1_double(<16 x i32> %a) { +; KNL-LABEL: uitofp_16i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; KNL-NEXT: movq {{.*}}(%rip), %rax +; KNL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; KNL-NEXT: kshiftrw $8, %k1, %k1 +; KNL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm1, %ymm1 +; KNL-NEXT: vcvtudq2pd %ymm1, %zmm1 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_16i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; SKX-NEXT: movl {{.*}}(%rip), %eax +; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} +; SKX-NEXT: vcvtudq2pd %ymm1, %zmm1 +; SKX-NEXT: retq + %mask = icmp slt <16 x i32> 
%a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x double> + ret <16 x double> %1 +} + +define <8 x float> @uitofp_8i1_float(<8 x i32> %a) { +; KNL-LABEL: uitofp_8i1_float: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_8i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0 +; SKX-NEXT: retq + %mask = icmp slt <8 x i32> %a, zeroinitializer + %1 = uitofp <8 x i1> %mask to <8 x float> + ret <8 x float> %1 +} + +define <8 x double> @uitofp_8i1_double(<8 x i32> %a) { +; KNL-LABEL: uitofp_8i1_double: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_8i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 +; SKX-NEXT: retq + %mask = icmp slt <8 x i32> %a, zeroinitializer + %1 = uitofp <8 x i1> %mask to <8 x double> + ret <8 x double> %1 +} + +define <4 x float> @uitofp_4i1_float(<4 x i32> %a) { +; KNL-LABEL: uitofp_4i1_float: +; KNL: ## BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpsrld $31, %xmm0, %xmm0 +; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_4i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, 
%xmm1, %xmm1 +; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 +; SKX-NEXT: retq + %mask = icmp slt <4 x i32> %a, zeroinitializer + %1 = uitofp <4 x i1> %mask to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @uitofp_4i1_double(<4 x i32> %a) { +; KNL-LABEL: uitofp_4i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpsrld $31, %xmm0, %xmm0 +; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_4i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0 +; SKX-NEXT: retq + %mask = icmp slt <4 x i32> %a, zeroinitializer + %1 = uitofp <4 x i1> %mask to <4 x double> + ret <4 x double> %1 +} + +define <2 x float> @uitofp_2i1_float(<2 x i32> %a) { +; KNL-LABEL: uitofp_2i1_float: +; KNL: ## BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0 +; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_2i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 +; SKX-NEXT: retq + %mask = icmp ult <2 x i32> %a, zeroinitializer + %1 
= uitofp <2 x i1> %mask to <2 x float> + ret <2 x float> %1 +} + +define <2 x double> @uitofp_2i1_double(<2 x i32> %a) { +; KNL-LABEL: uitofp_2i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0 +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm0 +; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_2i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} +; SKX-NEXT: vcvtuqq2pd %xmm0, %xmm0 +; SKX-NEXT: retq + %mask = icmp ult <2 x i32> %a, zeroinitializer + %1 = uitofp <2 x i1> %mask to <2 x double> + ret <2 x double> %1 +} diff --git a/test/CodeGen/X86/pr28504.ll b/test/CodeGen/X86/pr28504.ll new file mode 100644 index 0000000..a617c8a --- /dev/null +++ b/test/CodeGen/X86/pr28504.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +; The test case is rather involved, because we need to get to a state where +; We have a sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) combine, +; BUT this combine is only triggered post-legalization, so the setcc's return +; type is i8. So we can't have the combine opportunity be exposed too early. +; Basically, what we want to see is that the compare result zero-extended, and +; then stored. Only one zext, and no sexts. 
+ +; CHECK-LABEL: main: +; CHECK: movzbl (%rdi), %[[EAX:.*]] +; CHECK-NEXT: xorl %e[[C:.]]x, %e[[C]]x +; CHECK-NEXT: cmpl $1, %[[EAX]] +; CHECK-NEXT: sete %[[C]]l +; CHECK-NEXT: movl %e[[C]]x, (%rsi) +define void @main(i8* %p, i32* %q) { +bb: + %tmp4 = load i8, i8* %p, align 1 + %tmp5 = sext i8 %tmp4 to i32 + %tmp6 = load i8, i8* %p, align 1 + %tmp7 = zext i8 %tmp6 to i32 + %tmp8 = sub nsw i32 %tmp5, %tmp7 + %tmp11 = icmp eq i32 %tmp7, 1 + %tmp12 = zext i1 %tmp11 to i32 + %tmp13 = add nsw i32 %tmp8, %tmp12 + %tmp14 = trunc i32 %tmp13 to i8 + %tmp15 = sext i8 %tmp14 to i16 + %tmp16 = sext i16 %tmp15 to i32 + store i32 %tmp16, i32* %q, align 4 + br i1 %tmp11, label %bb21, label %bb22 + +bb21: ; preds = %bb + unreachable + +bb22: ; preds = %bb + ret void +} diff --git a/test/CodeGen/X86/pr28824.ll b/test/CodeGen/X86/pr28824.ll new file mode 100644 index 0000000..ced1f00 --- /dev/null +++ b/test/CodeGen/X86/pr28824.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s + +@d = global i32 0, align 4 + +; Verify the sar happens before ecx is clobbered with the parameter being +; passed to fn3 +; CHECK-LABEL: fn4 +; CHECK: movb d, %cl +; CHECK: sarl %cl +; CHECK: movl $2, %ecx +define i32 @fn4(i32 %i) #0 { +entry: + %0 = load i32, i32* @d, align 4 + %shr = ashr i32 %i, %0 + tail call fastcc void @fn3(i32 2, i32 5, i32 %shr, i32 %i) + %cmp = icmp slt i32 %shr, 1 + %. = zext i1 %cmp to i32 + ret i32 %. 
+} + +declare void @fn3(i32 %p1, i32 %p2, i32 %p3, i32 %p4) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll index 2102b42..aad00e7 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll @@ -6,13 +6,12 @@ define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind { ; X64-LABEL: test_mm_cvtsi64_ss: ; X64: # BB#0: -; X64-NEXT: cvtsi2ssq %rdi, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: cvtsi2ssq %rdi, %xmm0 ; X64-NEXT: retq - %cvt = sitofp i64 %a1 to float - %res = insertelement <4 x float> %a0, float %cvt, i32 0 + %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ret <4 x float> %res } +declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind { ; X64-LABEL: test_mm_cvtss_si64: @@ -29,7 +28,7 @@ define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: retq - %cvt = extractelement <4 x float> %a0, i32 0 - %res = fptosi float %cvt to i64 + %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ret i64 %res } +declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 090ddfd..4715b7f 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -707,20 +707,17 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind { ; X32-LABEL: test_mm_cvtsi32_ss: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: cvtsi2ssl %eax, %xmm1 -; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: cvtsi2ssl 
{{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtsi32_ss: ; X64: # BB#0: -; X64-NEXT: cvtsi2ssl %edi, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: cvtsi2ssl %edi, %xmm0 ; X64-NEXT: retq - %cvt = sitofp i32 %a1 to float - %res = insertelement <4 x float> %a0, float %cvt, i32 0 + %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1) ret <4 x float> %res } +declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind { ; X32-LABEL: test_mm_cvtss_f32: @@ -762,10 +759,10 @@ define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttss2si %xmm0, %eax ; X64-NEXT: retq - %cvt = extractelement <4 x float> %a0, i32 0 - %res = fptosi float %cvt to i32 + %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ret i32 %res } +declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind { ; X32-LABEL: test_mm_cvttss_si32: @@ -777,8 +774,7 @@ define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttss2si %xmm0, %eax ; X64-NEXT: retq - %cvt = extractelement <4 x float> %a0, i32 0 - %res = fptosi float %cvt to i32 + %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ret i32 %res } diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll index f5ecfa4..6b9dc40 100644 --- a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll +++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll @@ -25,13 +25,12 @@ define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind { define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind { ; X64-LABEL: test_mm_cvtsi64_sd: ; X64: # BB#0: -; X64-NEXT: cvtsi2sdq %rdi, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: cvtsi2sdq %rdi, %xmm0 ; X64-NEXT: retq - %cvt = sitofp i64 %a1 to 
double - %res = insertelement <2 x double> %a0, double %cvt, i32 0 + %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ret <2 x double> %res } +declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind { ; X64-LABEL: test_mm_cvtsi64_si128: @@ -48,10 +47,10 @@ define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttsd2si %xmm0, %rax ; X64-NEXT: retq - %ext = extractelement <2 x double> %a0, i32 0 - %res = fptosi double %ext to i64 + %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ret i64 %res } +declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind { ; X64-LABEL: test_mm_loadu_si64: diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index fa71325..d3ebba9 100644 --- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1208,6 +1208,39 @@ define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind { } declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone +define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) { +; X32-LABEL: test_mm_cvtsd_ss: +; X32: # BB#0: +; X32-NEXT: cvtsd2ss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cvtsd_ss: +; X64: # BB#0: +; X64-NEXT: cvtsd2ss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone + +define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) { +; X32-LABEL: test_mm_cvtsd_ss_load: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movaps (%eax), %xmm1 +; X32-NEXT: cvtsd2ss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: 
test_mm_cvtsd_ss_load: +; X64: # BB#0: +; X64-NEXT: movaps (%rdi), %xmm1 +; X64-NEXT: cvtsd2ss %xmm1, %xmm0 +; X64-NEXT: retq + %a1 = load <2 x double>, <2 x double>* %p1 + %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) + ret <4 x float> %res +} + define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind { ; X32-LABEL: test_mm_cvtsi128_si32: ; X32: # BB#0: @@ -1303,10 +1336,11 @@ define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttps2dq %xmm0, %xmm0 ; X64-NEXT: retq - %res = fptosi <4 x float> %a0 to <4 x i32> + %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) %bc = bitcast <4 x i32> %res to <2 x i64> ret <2 x i64> %bc } +declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind { ; X32-LABEL: test_mm_cvttsd_si32: @@ -1318,10 +1352,10 @@ define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttsd2si %xmm0, %eax ; X64-NEXT: retq - %ext = extractelement <2 x double> %a0, i32 0 - %res = fptosi double %ext to i32 + %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ret i32 %res } +declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm_div_pd: diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll index ae6626b..27a3fce 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -66,17 +66,6 @@ define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) { declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone -define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) { -; CHECK-LABEL: test_x86_sse2_cvttps2dq: -; CHECK: ## BB#0: -; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: retl - %res = call <4 x i32> 
@llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone - - define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_x86_sse2_storel_dq: ; CHECK: ## BB#0: @@ -94,7 +83,7 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) { ; CHECK-LABEL: test_x86_sse2_storeu_dq: ; CHECK: ## BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: paddb LCPI8_0, %xmm0 +; CHECK-NEXT: paddb LCPI7_0, %xmm0 ; CHECK-NEXT: movdqu %xmm0, (%eax) ; CHECK-NEXT: retl %a2 = add <16 x i8> %a1, diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index 617e30e..3ae3aec 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL @@ -274,6 +274,25 @@ define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) { declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone +define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %p1) { +; SSE-LABEL: test_x86_sse2_cvtsd2ss_load: +; SSE: ## BB#0: +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movaps (%eax), %xmm1 +; SSE-NEXT: cvtsd2ss %xmm1, %xmm0 +; SSE-NEXT: retl +; +; KNL-LABEL: test_x86_sse2_cvtsd2ss_load: +; KNL: ## BB#0: +; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL-NEXT: vcvtsd2ss (%eax), %xmm0, %xmm0 +; KNL-NEXT: retl + %a1 = load <2 x double>, <2 x double>* %p1 + %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1] + ret <4 x float> %res 
+} + + define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) { ; SSE-LABEL: test_x86_sse2_cvtsi2sd: ; SSE: ## BB#0: @@ -306,6 +325,25 @@ define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) { declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone +define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, <4 x float>* %p1) { +; SSE-LABEL: test_x86_sse2_cvtss2sd_load: +; SSE: ## BB#0: +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movaps (%eax), %xmm1 +; SSE-NEXT: cvtss2sd %xmm1, %xmm0 +; SSE-NEXT: retl +; +; KNL-LABEL: test_x86_sse2_cvtss2sd_load: +; KNL: ## BB#0: +; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL-NEXT: vcvtss2sd (%eax), %xmm0, %xmm0 +; KNL-NEXT: retl + %a1 = load <4 x float>, <4 x float>* %p1 + %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} + + define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) { ; SSE-LABEL: test_x86_sse2_cvttpd2dq: ; SSE: ## BB#0: @@ -322,6 +360,22 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) { declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone +define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) { +; SSE-LABEL: test_x86_sse2_cvttps2dq: +; SSE: ## BB#0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: retl +; +; KNL-LABEL: test_x86_sse2_cvttps2dq: +; KNL: ## BB#0: +; KNL-NEXT: vcvttps2dq %xmm0, %xmm0 +; KNL-NEXT: retl + %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone + + define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) { ; SSE-LABEL: test_x86_sse2_cvttsd2si: ; SSE: ## BB#0: diff --git a/test/CodeGen/X86/tail-merge-after-mbp.ll b/test/CodeGen/X86/tail-merge-after-mbp.ll new file mode 100644 index 0000000..dc5f3a1 --- /dev/null +++ 
b/test/CodeGen/X86/tail-merge-after-mbp.ll @@ -0,0 +1,94 @@ +; RUN: llc -mtriple=x86_64-linux -o - %s | FileCheck %s + +%0 = type { %1, %3* } +%1 = type { %2* } +%2 = type { %2*, i8* } +%3 = type { i32, i32 (i32, i32)* } + + +declare i32 @Up(...) +declare i32 @f(i32, i32) + +; check loop block_14 is not merged with block_21 +; check loop block_11 is not merged with block_18, block_25 +define i32 @foo(%0* nocapture readonly, i32, i1 %c, i8* %p1, %2** %p2) { +; CHECK-LABEL: foo: +; CHECK: # %block_11 +; CHECK-NEXT: movq (%r14), %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: je +; CHECK-NEXT:# %block_14 +; CHECK-NEXT: cmpq $0, 8(%rax) +; CHECK-NEXT: jne +; CHECK-NEXT:# %block_18 +; CHECK-NEXT: movq (%r14), %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: je +; CHECK-NEXT:# %block_21 +; CHECK-NEXT:# =>This Inner Loop Header +; CHECK-NEXT: cmpq $0, 8(%rax) +; CHECK-NEXT: jne +; CHECK-NEXT:# %block_25 +; CHECK-NEXT:# in Loop +; CHECK-NEXT: movq (%r14), %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jne + br i1 %c, label %block_34, label %block_3 + +block_3: ; preds = %2 + br i1 %c, label %block_7, label %block_4 + +block_4: ; preds = %block_3 + %a5 = tail call i32 @f(i32 undef, i32 undef) + %a6 = icmp eq i32 %a5, 0 + br i1 %a6, label %block_7, label %block_34 + +block_7: ; preds = %block_4, %block_3 + %a8 = icmp eq %2* null, null + br i1 %a8, label %block_34, label %block_9 + +block_9: ; preds = %block_7 + %a10 = icmp eq i8* %p1, null + br i1 %a10, label %block_11, label %block_32 + +block_11: ; preds = %block_9 + %a12 = load %2*, %2** %p2, align 8 + %a13 = icmp eq %2* %a12, null + br i1 %a13, label %block_34, label %block_14 + +block_14: ; preds = %block_11 + %a15 = getelementptr inbounds %2, %2* %a12, i64 0, i32 1 + %a16 = load i8*, i8** %a15, align 8 + %a17 = icmp eq i8* %a16, null + br i1 %a17, label %block_18, label %block_32 + +block_18: ; preds = %block_14 + %a19 = load %2*, %2** %p2, align 8 + %a20 = icmp eq %2* %a19, null + br i1 %a20, label 
%block_34, label %block_21 + +block_21: ; preds = %block_18 + %a22 = getelementptr inbounds %2, %2* %a19, i64 0, i32 1 + %a23 = load i8*, i8** %a22, align 8 + %a24 = icmp eq i8* %a23, null + br i1 %a24, label %block_25, label %block_32 + +block_25: ; preds = %block_28, %block_21 + %a26 = load %2*, %2** %p2, align 8 + %a27 = icmp eq %2* %a26, null + br i1 %a27, label %block_34, label %block_28 + +block_28: ; preds = %block_25 + %a29 = getelementptr inbounds %2, %2* %a26, i64 0, i32 1 + %a30 = load i8*, i8** %a29, align 8 + %a31 = icmp eq i8* %a30, null + br i1 %a31, label %block_25, label %block_32 + +block_32: ; preds = %block_28, %block_21, %block_14, %block_9 + %a33 = tail call i32 (...) @Up() + br label %block_34 + +block_34: ; preds = %block_32, %block_25, %block_18, %block_11, %block_7, %block_4, %2 + %a35 = phi i32 [ 0, %2 ], [ %a5, %block_4 ], [ 0, %block_7 ], [ 0, %block_11 ], [ 0, %block_32 ], [ 0, %block_18 ], [ 0, %block_25 ] + ret i32 %a35 +} diff --git a/test/CodeGen/X86/twoaddr-lea.ll b/test/CodeGen/X86/twoaddr-lea.ll index 5779cf3..2944b17 100644 --- a/test/CodeGen/X86/twoaddr-lea.ll +++ b/test/CodeGen/X86/twoaddr-lea.ll @@ -44,3 +44,60 @@ entry: %0 = shl i64 %x, 1 ret i64 %0 } + +@global = external global i32, align 4 +@global2 = external global i64, align 8 + +; Test that liveness is properly updated and we do not encounter the +; assert/crash from http://llvm.org/PR28301 +; CHECK-LABEL: ham +define void @ham() { +bb: + br label %bb1 + +bb1: + %tmp = phi i64 [ %tmp40, %bb9 ], [ 0, %bb ] + %tmp2 = phi i32 [ %tmp39, %bb9 ], [ 0, %bb ] + %tmp3 = icmp sgt i32 undef, 10 + br i1 %tmp3, label %bb2, label %bb3 + +bb2: + %tmp6 = load i32, i32* @global, align 4 + %tmp8 = add nsw i32 %tmp6, %tmp2 + %tmp9 = sext i32 %tmp8 to i64 + br label %bb6 + +bb3: +; CHECK: subl %e[[REG0:[a-z0-9]+]], +; CHECK: leaq 4({{%[a-z0-9]+}}), %r[[REG0]] + %tmp14 = phi i64 [ %tmp15, %bb5 ], [ 0, %bb1 ] + %tmp15 = add nuw i64 %tmp14, 4 + %tmp16 = trunc i64 %tmp14 to i32 + %tmp17 = 
sub i32 %tmp2, %tmp16 + br label %bb4 + +bb4: + %tmp20 = phi i64 [ %tmp14, %bb3 ], [ %tmp34, %bb5 ] + %tmp28 = icmp eq i32 %tmp17, 0 + br i1 %tmp28, label %bb5, label %bb8 + +bb5: + %tmp34 = add nuw nsw i64 %tmp20, 1 + %tmp35 = icmp slt i64 %tmp34, %tmp15 + br i1 %tmp35, label %bb4, label %bb3 + +bb6: + store volatile i64 %tmp, i64* @global2, align 8 + store volatile i64 %tmp9, i64* @global2, align 8 + store volatile i32 %tmp6, i32* @global, align 4 + %tmp45 = icmp slt i32 undef, undef + br i1 %tmp45, label %bb6, label %bb9 + +bb8: + unreachable + +bb9: + %tmp39 = add nuw nsw i32 %tmp2, 4 + %tmp40 = add nuw i64 %tmp, 4 + br label %bb1 +} diff --git a/test/DebugInfo/COFF/inlining-same-name.ll b/test/DebugInfo/COFF/inlining-same-name.ll index 44b8791..fda5a6d 100644 --- a/test/DebugInfo/COFF/inlining-same-name.ll +++ b/test/DebugInfo/COFF/inlining-same-name.ll @@ -33,7 +33,7 @@ target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc" -define void @main(i32* %i.i) { +define void @main(i32* %i.i) !dbg !16 { store volatile i32 3, i32* %i.i, !dbg !6 store volatile i32 3, i32* %i.i, !dbg !19 ret void diff --git a/test/DebugInfo/COFF/pr28747.ll b/test/DebugInfo/COFF/pr28747.ll new file mode 100644 index 0000000..d19a2fa --- /dev/null +++ b/test/DebugInfo/COFF/pr28747.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK: .section .debug$S,"dr"{{$}} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .cv_filechecksums +; CHECK-NEXT: .cv_stringtable + +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +define void @baz() { +entry: + %x.i.i = alloca i32, align 4 + call void @llvm.dbg.declare(metadata i32* %x.i.i, metadata !6, metadata !12), !dbg !13 + store i32 5, i32* %x.i.i, align 4, !dbg !13 + ret void +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #0 + +attributes #0 = { nounwind 
readnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 276756) (llvm/trunk 276952)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "-", directory: "/") +!2 = !{} +!3 = !{i32 2, !"CodeView", i32 1} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 4.0.0 (trunk 276756) (llvm/trunk 276952)"} +!6 = !DILocalVariable(name: "x", scope: !7, file: !8, line: 1, type: !11) +!7 = distinct !DISubprogram(name: "foo", scope: !8, file: !8, line: 1, type: !9, isLocal: true, isDefinition: true, scopeLine: 1, isOptimized: false, unit: !0, variables: !2) +!8 = !DIFile(filename: "", directory: "/") +!9 = !DISubroutineType(types: !10) +!10 = !{null} +!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!12 = !DIExpression() +!13 = !DILocation(line: 1, column: 56, scope: !7, inlinedAt: !14) +!14 = distinct !DILocation(line: 2, column: 52, scope: !15) +!15 = distinct !DISubprogram(name: "bar", scope: !8, file: !8, line: 2, type: !9, isLocal: true, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, variables: !2) diff --git a/test/Instrumentation/ThreadSanitizer/do-not-instrument-memory-access.ll b/test/Instrumentation/ThreadSanitizer/do-not-instrument-memory-access.ll index db12ec7..494df83 100644 --- a/test/Instrumentation/ThreadSanitizer/do-not-instrument-memory-access.ll +++ b/test/Instrumentation/ThreadSanitizer/do-not-instrument-memory-access.ll @@ -13,6 +13,8 @@ target triple = "x86_64-apple-macosx10.9" @__llvm_gcov_ctr = internal global [1 x i64] zeroinitializer @__llvm_gcov_ctr.1 = internal global [1 x i64] zeroinitializer +@__llvm_gcov_global_state_pred = internal global i32 0 +@__llvm_gcda_foo = internal global i32 0 define i32 @test_gep() sanitize_thread { entry: @@ -42,5 +44,16 @@ entry: ret i32 undef } +define void @test_load() 
sanitize_thread { +entry: + %0 = load i32, i32* @__llvm_gcov_global_state_pred + store i32 1, i32* @__llvm_gcov_global_state_pred + + %1 = load i32, i32* @__llvm_gcda_foo + store i32 1, i32* @__llvm_gcda_foo + + ret void +} + ; CHECK-NOT: {{call void @__tsan_write}} ; CHECK: __tsan_init diff --git a/test/Linker/Inputs/metadata-with-global-value-operand.ll b/test/Linker/Inputs/metadata-with-global-value-operand.ll new file mode 100644 index 0000000..21d3e27 --- /dev/null +++ b/test/Linker/Inputs/metadata-with-global-value-operand.ll @@ -0,0 +1,3 @@ +!named.null = !{!0} + +!0 = !{null} diff --git a/test/Linker/metadata-with-global-value-operand.ll b/test/Linker/metadata-with-global-value-operand.ll new file mode 100644 index 0000000..fb4c01a --- /dev/null +++ b/test/Linker/metadata-with-global-value-operand.ll @@ -0,0 +1,14 @@ +; RUN: llvm-link -S -o - %s %S/Inputs/metadata-with-global-value-operand.ll | FileCheck %s +; This test confirms that the !{null} from the second module doesn't get mapped +; onto the abandoned !{i1* @var} node from this module. 
+ +; CHECK: @var = global +@var = global i1 false + +; CHECK: !named.vars = !{!0} +; CHECK: !named.null = !{!1} +!named.vars = !{!0} + +; CHECK: !0 = !{i1* @var} +; CHECK: !1 = !{null} +!0 = !{i1* @var} diff --git a/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt b/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt index 832aa3f..6ff5f54 100644 --- a/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt +++ b/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s +# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 2>&1 | FileCheck %s # Opcode=322 Name=SSAT Format=ARM_FORMAT_SATFRM(13) # 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 diff --git a/test/MC/Mips/cpsetup.s b/test/MC/Mips/cpsetup.s index c5d0f9b..149419f 100644 --- a/test/MC/Mips/cpsetup.s +++ b/test/MC/Mips/cpsetup.s @@ -1,22 +1,22 @@ -# RUN: llvm-mc -triple mips64-unknown-linux -target-abi o32 -filetype=obj -o - %s | \ +# RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 -filetype=obj -o - %s | \ # RUN: llvm-objdump -d -r - | FileCheck -check-prefixes=ALL,O32 %s -# RUN: llvm-mc -triple mips64-unknown-unknown -target-abi o32 %s | \ -# RUN: FileCheck -check-prefixes=ALL,ASM %s +# RUN: llvm-mc -triple mips-unknown-unknown -target-abi o32 %s | \ +# RUN: FileCheck -check-prefixes=ALL,ASM,ASM-O32 %s # RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 -filetype=obj -o - %s | \ # RUN: llvm-objdump -d -r - | \ # RUN: FileCheck -check-prefixes=ALL,NXX,N32 %s # RUN: llvm-mc -triple mips64-unknown-unknown -target-abi n32 %s | \ -# RUN: FileCheck -check-prefixes=ALL,ASM %s +# RUN: FileCheck -check-prefixes=ALL,ASM,ASM-N32 %s # RUN: llvm-mc -triple mips64-unknown-linux %s -filetype=obj -o - | \ # RUN: llvm-objdump -d -r - | \ # RUN: FileCheck -check-prefixes=ALL,NXX,N64 %s # RUN: llvm-mc -triple mips64-unknown-unknown %s | \ -# RUN: FileCheck 
-check-prefixes=ALL,ASM %s +# RUN: FileCheck -check-prefixes=ALL,ASM,ASM-N64 %s .text .option pic2 @@ -105,8 +105,10 @@ t3: # NXX-NEXT: nop # NXX-NEXT: sub $3, $3, $2 -# ASM: $tmp0: -# ASM-NEXT: .cpsetup $25, $2, $tmp0 +# ASM-O32: [[LABEL:\$tmp0]]: +# ASM-N32: [[LABEL:\.Ltmp0]]: +# ASM-N64: [[LABEL:\.Ltmp0]]: +# ASM-NEXT: .cpsetup $25, $2, [[LABEL]] # Ensure we have at least one instruction between labels so that the labels # we're matching aren't removed. diff --git a/test/MC/Mips/expansion-jal-sym-pic.s b/test/MC/Mips/expansion-jal-sym-pic.s index f2ceca0..1cc4751 100644 --- a/test/MC/Mips/expansion-jal-sym-pic.s +++ b/test/MC/Mips/expansion-jal-sym-pic.s @@ -10,7 +10,7 @@ # RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=micromips -show-encoding |\ # RUN: FileCheck %s -check-prefixes=ALL,MICROMIPS,O32-MICROMIPS -# RUN: llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 -mattr=micromips -show-encoding |\ +# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 -mattr=micromips -show-encoding |\ # RUN: FileCheck %s -check-prefixes=ALL,MICROMIPS,N32-MICROMIPS # RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 -mattr=micromips -show-encoding |\ @@ -164,19 +164,19 @@ local_label: # N32: lw $25, %got_disp($tmp0)($gp) # encoding: [0x8f,0x99,A,A] # N32: # fixup A - offset: 0, value: %got_disp($tmp0), kind: fixup_Mips_GOT_DISP -# N64: ld $25, %got_disp($tmp0)($gp) # encoding: [0xdf,0x99,A,A] -# N64: # fixup A - offset: 0, value: %got_disp($tmp0), kind: fixup_Mips_GOT_DISP +# N64: ld $25, %got_disp(.Ltmp0)($gp) # encoding: [0xdf,0x99,A,A] +# N64: # fixup A - offset: 0, value: %got_disp(.Ltmp0), kind: fixup_Mips_GOT_DISP # O32-MICROMIPS: lw $25, %got($tmp0)($gp) # encoding: [0xff,0x3c,A,A] # O32-MICROMIPS: # fixup A - offset: 0, value: %got($tmp0), kind: fixup_MICROMIPS_GOT16 # O32-MICROMIPS: addiu $25, $25, %lo($tmp0) # encoding: [0x33,0x39,A,A] # O32-MICROMIPS: # fixup A - offset: 0, value: %lo($tmp0), kind: fixup_MICROMIPS_LO16 -# N32-MICROMIPS: lw $25, 
%got_disp($tmp0)($gp) # encoding: [0xff,0x3c,A,A] -# N32-MICROMIPS: # fixup A - offset: 0, value: %got_disp($tmp0), kind: fixup_MICROMIPS_GOT_DISP +# N32-MICROMIPS: lw $25, %got_disp(.Ltmp0)($gp) # encoding: [0xff,0x3c,A,A] +# N32-MICROMIPS: # fixup A - offset: 0, value: %got_disp(.Ltmp0), kind: fixup_MICROMIPS_GOT_DISP -# N64-MICROMIPS: ld $25, %got_disp($tmp0)($gp) # encoding: [0xdf,0x99,A,A] -# N64-MICROMIPS: # fixup A - offset: 0, value: %got_disp($tmp0), kind: fixup_MICROMIPS_GOT_DISP +# N64-MICROMIPS: ld $25, %got_disp(.Ltmp0)($gp) # encoding: [0xdf,0x99,A,A] +# N64-MICROMIPS: # fixup A - offset: 0, value: %got_disp(.Ltmp0), kind: fixup_MICROMIPS_GOT_DISP # NORMAL: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MICROMIPS: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] diff --git a/test/MC/Mips/macro-la.s b/test/MC/Mips/macro-la.s index cca4805..c419d64 100644 --- a/test/MC/Mips/macro-la.s +++ b/test/MC/Mips/macro-la.s @@ -1,11 +1,11 @@ # RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r2 | \ -# RUN: FileCheck %s +# RUN: FileCheck %s --check-prefixes=CHECK,O32 # RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 | \ -# RUN: FileCheck %s +# RUN: FileCheck %s --check-prefixes=CHECK,O32 # RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r2 -target-abi=n32 | \ -# RUN: FileCheck %s +# RUN: FileCheck %s --check-prefixes=CHECK,N32 # RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r6 -target-abi=n32 | \ -# RUN: FileCheck %s +# RUN: FileCheck %s --check-prefixes=CHECK,N32 # N64 should be acceptable too but we cannot convert la to dla yet. 
@@ -272,8 +272,12 @@ la $6, symbol+8($6) # CHECK: lui $1, %hi(symbol+8) # encoding: [0x3c,0x0 # CHECK: addiu $1, $1, %lo(symbol+8) # encoding: [0x24,0x21,A,A] # CHECK: # fixup A - offset: 0, value: %lo(symbol+8), kind: fixup_Mips_LO16 # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21] -la $5, 1f # CHECK: lui $5, %hi($tmp0) # encoding: [0x3c,0x05,A,A] - # CHECK: # fixup A - offset: 0, value: %hi($tmp0), kind: fixup_Mips_HI16 - # CHECK: addiu $5, $5, %lo($tmp0) # encoding: [0x24,0xa5,A,A] - # CHECK: # fixup A - offset: 0, value: %lo($tmp0), kind: fixup_Mips_LO16 +la $5, 1f # O32: lui $5, %hi($tmp0) # encoding: [0x3c,0x05,A,A] + # O32: # fixup A - offset: 0, value: %hi($tmp0), kind: fixup_Mips_HI16 + # O32: addiu $5, $5, %lo($tmp0) # encoding: [0x24,0xa5,A,A] + # O32: # fixup A - offset: 0, value: %lo($tmp0), kind: fixup_Mips_LO16 + # N32: lui $5, %hi(.Ltmp0) # encoding: [0x3c,0x05,A,A] + # N32: # fixup A - offset: 0, value: %hi(.Ltmp0), kind: fixup_Mips_HI16 + # N32: addiu $5, $5, %lo(.Ltmp0) # encoding: [0x24,0xa5,A,A] + # N32: # fixup A - offset: 0, value: %lo(.Ltmp0), kind: fixup_Mips_LO16 1: diff --git a/test/MC/Mips/mips3/valid.s b/test/MC/Mips/mips3/valid.s index bcc96b5..d9f7729 100644 --- a/test/MC/Mips/mips3/valid.s +++ b/test/MC/Mips/mips3/valid.s @@ -112,8 +112,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips4/valid.s b/test/MC/Mips/mips4/valid.s index 0a4eb4d..500560e 100644 --- a/test/MC/Mips/mips4/valid.s +++ b/test/MC/Mips/mips4/valid.s @@ -116,8 +116,8 @@ a: floor.l.s $f12,$f5 floor.w.d 
$f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips5/valid.s b/test/MC/Mips/mips5/valid.s index 270ff16..c60a918 100644 --- a/test/MC/Mips/mips5/valid.s +++ b/test/MC/Mips/mips5/valid.s @@ -116,8 +116,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips64/valid.s b/test/MC/Mips/mips64/valid.s index 0ba831b..b8c8a10 100644 --- a/test/MC/Mips/mips64/valid.s +++ b/test/MC/Mips/mips64/valid.s @@ -123,8 +123,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips64r2/valid.s b/test/MC/Mips/mips64r2/valid.s index 5ae3adc..7dd7289 100644 --- a/test/MC/Mips/mips64r2/valid.s +++ b/test/MC/Mips/mips64r2/valid.s @@ -136,8 +136,8 @@ a: floor.l.s $f12,$f5 
floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips64r3/valid.s b/test/MC/Mips/mips64r3/valid.s index ab385da..83681f6 100644 --- a/test/MC/Mips/mips64r3/valid.s +++ b/test/MC/Mips/mips64r3/valid.s @@ -136,8 +136,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips64r5/valid.s b/test/MC/Mips/mips64r5/valid.s index 39782f3..e63ed1d 100644 --- a/test/MC/Mips/mips64r5/valid.s +++ b/test/MC/Mips/mips64r5/valid.s @@ -137,8 +137,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/relocation.s b/test/MC/Mips/relocation.s index abbbc6d..42a015d 100644 --- a/test/MC/Mips/relocation.s +++ b/test/MC/Mips/relocation.s @@ -116,7 +116,7 @@ baz: .long foo 
// RELOC: R_MIPS_32 foo // ?????: R_MIPS_SHIFT5 foo // ?????: R_MIPS_SHIFT6 foo -// DATA-NEXT: 0060: 24620000 24620000 24620000 24620000 +// DATA-NEXT: 0060: 24620000 24620000 24620004 24620000 addiu $2, $3, %got_disp(foo) // RELOC: R_MIPS_GOT_DISP foo // ENCBE: addiu $2, $3, %got_disp(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %got_disp(foo) # encoding: [A,A,0x62,0x24] @@ -127,17 +127,27 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: addiu $2, $3, %got_page(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %got_page(foo), kind: fixup_Mips_GOT_PAGE + addiu $2, $3, %got_page(bar) // RELOC: R_MIPS_GOT_PAGE .data + // ENCBE: addiu $2, $3, %got_page(bar) # encoding: [0x24,0x62,A,A] + // ENCLE: addiu $2, $3, %got_page(bar) # encoding: [A,A,0x62,0x24] + // FIXUP: # fixup A - offset: 0, value: %got_page(bar), kind: fixup_Mips_GOT_PAGE + addiu $2, $3, %got_ofst(foo) // RELOC: R_MIPS_GOT_OFST foo // ENCBE: addiu $2, $3, %got_ofst(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %got_ofst(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %got_ofst(foo), kind: fixup_Mips_GOT_OFST +// DATA-NEXT: 0070: 24620004 24620000 24620000 64620000 + addiu $2, $3, %got_ofst(bar) // RELOC: R_MIPS_GOT_OFST .data + // ENCBE: addiu $2, $3, %got_ofst(bar) # encoding: [0x24,0x62,A,A] + // ENCLE: addiu $2, $3, %got_ofst(bar) # encoding: [A,A,0x62,0x24] + // FIXUP: # fixup A - offset: 0, value: %got_ofst(bar), kind: fixup_Mips_GOT_OFST + addiu $2, $3, %got_hi(foo) // RELOC: R_MIPS_GOT_HI16 foo // ENCBE: addiu $2, $3, %got_hi(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %got_hi(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %got_hi(foo), kind: fixup_Mips_GOT_HI16 -// DATA-NEXT: 0070: 24620000 64620000 64620000 24620000 addiu $2, $3, %got_lo(foo) // RELOC: R_MIPS_GOT_LO16 foo // ENCBE: addiu $2, $3, %got_lo(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %got_lo(foo) # encoding: 
[A,A,0x62,0x24] @@ -154,6 +164,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: daddiu $2, $3, %higher(foo) # encoding: [A,A,0x62,0x64] // FIXUP: # fixup A - offset: 0, value: %higher(foo), kind: fixup_Mips_HIGHER +// DATA-NEXT: 0080: 64620000 24620000 24620000 24620000 daddiu $2, $3, %highest(foo) // RELOC: R_MIPS_HIGHEST foo // ENCBE: daddiu $2, $3, %highest(foo) # encoding: [0x64,0x62,A,A] // ENCLE: daddiu $2, $3, %highest(foo) # encoding: [A,A,0x62,0x64] @@ -165,7 +176,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: addiu $2, $3, %call_hi(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %call_hi(foo), kind: fixup_Mips_CALL_HI16 -// DATA-NEXT: 0080: 24620000 24620000 24620000 24620000 +// DATA-NEXT: 0090: 24620000 24620000 24620000 24620000 addiu $2, $3, %call_lo(foo) // RELOC: R_MIPS_CALL_LO16 foo // ENCBE: addiu $2, $3, %call_lo(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %call_lo(foo) # encoding: [A,A,0x62,0x24] @@ -321,7 +332,22 @@ foo_mm: // ENCLE: addiu $2, $2, %lo(long_mm) # encoding: [0x42'A',0x30'A',0x00,0x00] // FIXUP: # fixup A - offset: 0, value: %lo(long_mm), kind: fixup_MICROMIPS_LO16 -// DATA-NEXT: 0020: 30430000 30420000 30430000 30420004 +// DATA-NEXT: 0020: 30430004 00000000 30430004 00000000 + addiu $2, $3, %got_page(bar) // RELOC: R_MICROMIPS_GOT_PAGE .data + // ENCBE: addiu $2, $3, %got_page(bar) # encoding: [0x30,0x43,A,A] + // The placement of the 'A' annotations is incorrect. They use 32-bit little endian instead of 2x 16-bit little endian. + // ENCLE: addiu $2, $3, %got_page(bar) # encoding: [0x43'A',0x30'A',0x00,0x00] + // FIXUP: # fixup A - offset: 0, value: %got_page(bar), kind: fixup_MICROMIPS_GOT_PAGE + nop + + addiu $2, $3, %got_ofst(bar) // RELOC: R_MICROMIPS_GOT_OFST .data + // ENCBE: addiu $2, $3, %got_ofst(bar) # encoding: [0x30,0x43,A,A] + // The placement of the 'A' annotations is incorrect. They use 32-bit little endian instead of 2x 16-bit little endian. 
+ // ENCLE: addiu $2, $3, %got_ofst(bar) # encoding: [0x43'A',0x30'A',0x00,0x00] + // FIXUP: # fixup A - offset: 0, value: %got_ofst(bar), kind: fixup_MICROMIPS_GOT_OFST + nop + +// DATA-NEXT: 0030: 30430000 30420000 30430000 30420004 addiu $2, $3, %hi(foo_mm) // RELOC: R_MICROMIPS_HI16 foo_mm // ENCBE: addiu $2, $3, %hi(foo_mm) # encoding: [0x30,0x43,A,A] // ENCLE: addiu $2, $3, %hi(foo_mm) # encoding: [0x43'A',0x30'A',0x00,0x00] @@ -342,5 +368,5 @@ foo_mm: // ENCLE: addiu $2, $2, %lo(bar) # encoding: [0x42'A',0x30'A',0x00,0x00] // FIXUP: # fixup A - offset: 0, value: %lo(bar), kind: fixup_MICROMIPS_LO16 - .space 65536, 0 + .space 65520, 0 long_mm: diff --git a/test/Transforms/ConstProp/calls.ll b/test/Transforms/ConstProp/calls.ll index a445ac8..d9a884a 100644 --- a/test/Transforms/ConstProp/calls.ll +++ b/test/Transforms/ConstProp/calls.ll @@ -1,47 +1,47 @@ ; RUN: opt < %s -constprop -S | FileCheck %s ; RUN: opt < %s -constprop -disable-simplify-libcalls -S | FileCheck %s --check-prefix=FNOBUILTIN -declare double @acos(double) -declare double @asin(double) -declare double @atan(double) -declare double @atan2(double, double) -declare double @ceil(double) -declare double @cos(double) -declare double @cosh(double) -declare double @exp(double) -declare double @exp2(double) -declare double @fabs(double) -declare double @floor(double) -declare double @fmod(double, double) -declare double @log(double) -declare double @log10(double) -declare double @pow(double, double) -declare double @sin(double) -declare double @sinh(double) -declare double @sqrt(double) -declare double @tan(double) -declare double @tanh(double) +declare double @acos(double) readnone nounwind +declare double @asin(double) readnone nounwind +declare double @atan(double) readnone nounwind +declare double @atan2(double, double) readnone nounwind +declare double @ceil(double) readnone nounwind +declare double @cos(double) readnone nounwind +declare double @cosh(double) readnone nounwind +declare double 
@exp(double) readnone nounwind +declare double @exp2(double) readnone nounwind +declare double @fabs(double) readnone nounwind +declare double @floor(double) readnone nounwind +declare double @fmod(double, double) readnone nounwind +declare double @log(double) readnone nounwind +declare double @log10(double) readnone nounwind +declare double @pow(double, double) readnone nounwind +declare double @sin(double) readnone nounwind +declare double @sinh(double) readnone nounwind +declare double @sqrt(double) readnone nounwind +declare double @tan(double) readnone nounwind +declare double @tanh(double) readnone nounwind -declare float @acosf(float) -declare float @asinf(float) -declare float @atanf(float) -declare float @atan2f(float, float) -declare float @ceilf(float) -declare float @cosf(float) -declare float @coshf(float) -declare float @expf(float) -declare float @exp2f(float) -declare float @fabsf(float) -declare float @floorf(float) -declare float @fmodf(float, float) -declare float @logf(float) -declare float @log10f(float) -declare float @powf(float, float) -declare float @sinf(float) -declare float @sinhf(float) -declare float @sqrtf(float) -declare float @tanf(float) -declare float @tanhf(float) +declare float @acosf(float) readnone nounwind +declare float @asinf(float) readnone nounwind +declare float @atanf(float) readnone nounwind +declare float @atan2f(float, float) readnone nounwind +declare float @ceilf(float) readnone nounwind +declare float @cosf(float) readnone nounwind +declare float @coshf(float) readnone nounwind +declare float @expf(float) readnone nounwind +declare float @exp2f(float) readnone nounwind +declare float @fabsf(float) readnone nounwind +declare float @floorf(float) readnone nounwind +declare float @fmodf(float, float) readnone nounwind +declare float @logf(float) readnone nounwind +declare float @log10f(float) readnone nounwind +declare float @powf(float, float) readnone nounwind +declare float @sinf(float) readnone nounwind +declare 
float @sinhf(float) readnone nounwind +declare float @sqrtf(float) readnone nounwind +declare float @tanf(float) readnone nounwind +declare float @tanhf(float) readnone nounwind define double @T() { ; CHECK-LABEL: @T( @@ -193,11 +193,13 @@ entry: ret i1 %b } -; TODO: Inexact values should not fold as they are dependent on rounding mode +; Inexact values should not fold as they are dependent on rounding mode define i1 @test_sse_cvts_inexact() nounwind readnone { ; CHECK-LABEL: @test_sse_cvts_inexact( -; CHECK-NOT: call -; CHECK: ret i1 true +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call entry: %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> ) nounwind %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> ) nounwind diff --git a/test/Transforms/EarlyCSE/basic.ll b/test/Transforms/EarlyCSE/basic.ll index fa1a705..3c427d8 100644 --- a/test/Transforms/EarlyCSE/basic.ll +++ b/test/Transforms/EarlyCSE/basic.ll @@ -276,3 +276,17 @@ define void @dse_neg2(i32 *%P) { ret void } +@c = external global i32, align 4 +declare i32 @reads_c(i32 returned) +define void @pr28763() { +entry: +; CHECK-LABEL: @pr28763( +; CHECK: store i32 0, i32* @c, align 4 +; CHECK: call i32 @reads_c(i32 0) +; CHECK: store i32 2, i32* @c, align 4 + %load = load i32, i32* @c, align 4 + store i32 0, i32* @c, align 4 + %call = call i32 @reads_c(i32 0) + store i32 2, i32* @c, align 4 + ret void +} diff --git a/test/Transforms/GlobalOpt/metadata.ll b/test/Transforms/GlobalOpt/metadata.ll index 152d58e..b766349 100644 --- a/test/Transforms/GlobalOpt/metadata.ll +++ b/test/Transforms/GlobalOpt/metadata.ll @@ -28,5 +28,5 @@ declare void @llvm.foo(metadata, metadata) nounwind readnone ; CHECK: !named = !{![[NULL:[0-9]+]]} !0 = !{i8*** @G} -; CHECK-DAG: ![[NULL]] = !{null} +; CHECK-DAG: ![[NULL]] = distinct !{null} ; CHECK-DAG: ![[EMPTY]] = !{} diff --git a/test/Transforms/IndVarSimplify/pr28935.ll b/test/Transforms/IndVarSimplify/pr28935.ll new file mode 100644 index 0000000..0cfd1d3 --- 
/dev/null +++ b/test/Transforms/IndVarSimplify/pr28935.ll @@ -0,0 +1,20 @@ +; RUN: opt -S -indvars < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare i16 @fn1(i16 returned, i64) + +define void @fn2() { +; CHECK-LABEL: @fn2( +entry: + br label %for.cond + +for.cond: + %f.0 = phi i64 [ undef, %entry ], [ %inc, %for.cond ] + %conv = trunc i64 %f.0 to i16 + %call = tail call i16 @fn1(i16 %conv, i64 %f.0) + %conv2 = zext i16 %call to i32 + %inc = add nsw i64 %f.0, 1 + br label %for.cond +} diff --git a/test/Transforms/Inline/inalloca-not-static.ll b/test/Transforms/Inline/inalloca-not-static.ll new file mode 100644 index 0000000..e70e30d --- /dev/null +++ b/test/Transforms/Inline/inalloca-not-static.ll @@ -0,0 +1,63 @@ +; RUN: opt -always-inline -S < %s | FileCheck %s + +; We used to misclassify inalloca as a static alloca in the inliner. This only +; arose with for alwaysinline functions, because the normal inliner refuses to +; inline such things. 
+ +; Generated using this C++ source: +; struct Foo { +; Foo(); +; Foo(const Foo &o); +; ~Foo(); +; int a; +; }; +; __forceinline void h(Foo o) {} +; __forceinline void g() { h(Foo()); } +; void f() { g(); } + +; ModuleID = 't.cpp' +source_filename = "t.cpp" +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i386-pc-windows-msvc19.0.24210" + +%struct.Foo = type { i32 } + +declare i8* @llvm.stacksave() +declare void @llvm.stackrestore(i8*) + +declare x86_thiscallcc %struct.Foo* @"\01??0Foo@@QAE@XZ"(%struct.Foo* returned) unnamed_addr +declare x86_thiscallcc void @"\01??1Foo@@QAE@XZ"(%struct.Foo*) unnamed_addr + +define void @f() { +entry: + call void @g() + ret void +} + +define internal void @g() alwaysinline { +entry: + %inalloca.save = call i8* @llvm.stacksave() + %argmem = alloca inalloca <{ %struct.Foo }>, align 4 + %0 = getelementptr inbounds <{ %struct.Foo }>, <{ %struct.Foo }>* %argmem, i32 0, i32 0 + %call = call x86_thiscallcc %struct.Foo* @"\01??0Foo@@QAE@XZ"(%struct.Foo* %0) + call void @h(<{ %struct.Foo }>* inalloca %argmem) + call void @llvm.stackrestore(i8* %inalloca.save) + ret void +} + +; Function Attrs: alwaysinline inlinehint nounwind +define internal void @h(<{ %struct.Foo }>* inalloca) alwaysinline { +entry: + %o = getelementptr inbounds <{ %struct.Foo }>, <{ %struct.Foo }>* %0, i32 0, i32 0 + call x86_thiscallcc void @"\01??1Foo@@QAE@XZ"(%struct.Foo* %o) + ret void +} + +; CHECK: define void @f() +; CHECK: %inalloca.save.i = call i8* @llvm.stacksave() +; CHECK: alloca inalloca <{ %struct.Foo }>, align 4 +; CHECK: %call.i = call x86_thiscallcc %struct.Foo* @"\01??0Foo@@QAE@XZ"(%struct.Foo* %0) +; CHECK: %o.i.i = getelementptr inbounds <{ %struct.Foo }>, <{ %struct.Foo }>* %argmem.i, i32 0, i32 0 +; CHECK: call x86_thiscallcc void @"\01??1Foo@@QAE@XZ"(%struct.Foo* %o.i.i) +; CHECK: call void @llvm.stackrestore(i8* %inalloca.save.i) +; CHECK: ret void diff --git a/test/Transforms/Inline/inline_constprop.ll 
b/test/Transforms/Inline/inline_constprop.ll index de23b61..ab9e90c 100644 --- a/test/Transforms/Inline/inline_constprop.ll +++ b/test/Transforms/Inline/inline_constprop.ll @@ -279,3 +279,46 @@ return: %retval.0 = phi i32* [ %b, %if.end3 ], [ %a, %if.then ] ret i32* %retval.0 } + +declare i32 @PR28802.external(i32 returned %p1) + +define internal i32 @PR28802.callee() { +entry: + br label %cont + +cont: + %0 = phi i32 [ 0, %entry ] + %call = call i32 @PR28802.external(i32 %0) + ret i32 %call +} + +define i32 @PR28802() { +entry: + %call = call i32 @PR28802.callee() + ret i32 %call +} + +; CHECK-LABEL: define i32 @PR28802( +; CHECK: call i32 @PR28802.external(i32 0) +; CHECK: ret i32 0 + +define internal i32 @PR28848.callee(i32 %p2, i1 %c) { +entry: + br i1 %c, label %cond.end, label %cond.true + +cond.true: + br label %cond.end + +cond.end: + %cond = phi i32 [ 0, %cond.true ], [ %p2, %entry ] + %or = or i32 %cond, %p2 + ret i32 %or +} + +define i32 @PR28848() { +entry: + %call = call i32 @PR28848.callee(i32 0, i1 false) + ret i32 %call +} +; CHECK-LABEL: define i32 @PR28848( +; CHECK: ret i32 0 diff --git a/test/Transforms/InstCombine/call.ll b/test/Transforms/InstCombine/call.ll index ea338f0..5307dcb 100644 --- a/test/Transforms/InstCombine/call.ll +++ b/test/Transforms/InstCombine/call.ll @@ -276,3 +276,14 @@ define <2 x i16> @test16() { %X = call <2 x i16> bitcast (i32 ()* @test16a to <2 x i16> ()*)( ) ret <2 x i16> %X } + +declare i32 @pr28655(i32 returned %V) + +define i32 @test17() { +entry: + %C = call i32 @pr28655(i32 0) + ret i32 %C +} +; CHECK-LABEL: @test17( +; CHECK: call i32 @pr28655(i32 0) +; CHECK: ret i32 0 diff --git a/test/Transforms/InstCombine/log-pow.ll b/test/Transforms/InstCombine/log-pow.ll index a0c10d0..4e4a2b2 100644 --- a/test/Transforms/InstCombine/log-pow.ll +++ b/test/Transforms/InstCombine/log-pow.ll @@ -55,7 +55,8 @@ define double @log_exp2_not_fast(double %x) { ; CHECK-NEXT: %call3 = call fast double @log(double %call2) ; 
CHECK-NEXT: ret double %call3 -declare double @log(double) +declare double @log(double) #0 declare double @exp2(double) declare double @llvm.pow.f64(double, double) +attributes #0 = { nounwind readnone } diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll index e0e7bfc..413be89 100644 --- a/test/Transforms/InstCombine/select.ll +++ b/test/Transforms/InstCombine/select.ll @@ -1737,3 +1737,26 @@ define i32 @PR27137(i32 %a) { %s1 = select i1 %c1, i32 %s0, i32 -1 ret i32 %s1 } + +define i32 @select_icmp_slt0_xor(i32 %x) { +; CHECK-LABEL: @select_icmp_slt0_xor( +; CHECK-NEXT: [[TMP1:%.*]] = or i32 %x, -2147483648 +; CHECK-NEXT: ret i32 [[TMP1]] +; + %cmp = icmp slt i32 %x, zeroinitializer + %xor = xor i32 %x, 2147483648 + %x.xor = select i1 %cmp, i32 %x, i32 %xor + ret i32 %x.xor +} + +define <2 x i32> @select_icmp_slt0_xor_vec(<2 x i32> %x) { +; CHECK-LABEL: @select_icmp_slt0_xor_vec( +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> %x, +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %cmp = icmp slt <2 x i32> %x, zeroinitializer + %xor = xor <2 x i32> %x, + %x.xor = select <2 x i1> %cmp, <2 x i32> %x, <2 x i32> %xor + ret <2 x i32> %x.xor +} + diff --git a/test/Transforms/LCSSA/pr28424.ll b/test/Transforms/LCSSA/pr28424.ll new file mode 100644 index 0000000..cd79690 --- /dev/null +++ b/test/Transforms/LCSSA/pr28424.ll @@ -0,0 +1,87 @@ +; RUN: opt < %s -lcssa -S -o - | FileCheck %s +target triple = "x86_64-unknown-linux-gnu" + +; PR28424 +; Here LCSSA adds phi-nodes for %x into the loop exits. Then, SSAUpdater needs +; to insert phi-nodes to merge these values. That creates a new def, which in +; its turn needs another LCCSA phi-node, and this test ensures that we insert +; it. 
+ +; CHECK-LABEL: @foo1 +define internal i32 @foo1() { +entry: + br label %header + +header: + %x = add i32 0, 1 + br i1 undef, label %if, label %loopexit1 + +if: + br i1 undef, label %latch, label %loopexit2 + +latch: + br i1 undef, label %header, label %loopexit3 + +; CHECK: loopexit1: +; CHECK: %x.lcssa = phi i32 [ %x, %header ] +loopexit1: + br label %loop_with_insert_point + +; CHECK: loopexit2: +; CHECK: %x.lcssa1 = phi i32 [ %x, %if ] +loopexit2: + br label %exit + +; CHECK: loopexit3: +; CHECK: %x.lcssa2 = phi i32 [ %x, %latch ] +loopexit3: + br label %loop_with_insert_point + +; CHECK: loop_with_insert_point: +; CHECK: %x4 = phi i32 [ %x4, %loop_with_insert_point ], [ %x.lcssa2, %loopexit3 ], [ %x.lcssa, %loopexit1 ] +loop_with_insert_point: + br i1 undef, label %loop_with_insert_point, label %bb + +; CHECK: bb: +; CHECK: %x4.lcssa = phi i32 [ %x4, %loop_with_insert_point ] +bb: + br label %exit + +; CHECK: exit: +; CHECK: %x3 = phi i32 [ %x4.lcssa, %bb ], [ %x.lcssa1, %loopexit2 ] +exit: + ret i32 %x +} + +; CHECK-LABEL: @foo2 +define internal i32 @foo2() { +entry: + br label %header + +header: + %x = add i32 0, 1 + br i1 undef, label %latch, label %loopexit1 + +latch: + br i1 undef, label %header, label %loopexit2 + +; CHECK: loopexit1: +; CHECK: %x.lcssa = phi i32 [ %x, %header ] +loopexit1: + br label %loop_with_insert_point + +; CHECK: loopexit2: +; CHECK: %x.lcssa1 = phi i32 [ %x, %latch ] +loopexit2: + br label %loop_with_insert_point + +; CHECK: loop_with_insert_point: +; CHECK: %x2 = phi i32 [ %x2, %loop_with_insert_point ], [ %x.lcssa1, %loopexit2 ], [ %x.lcssa, %loopexit1 ] +loop_with_insert_point: + br i1 undef, label %loop_with_insert_point, label %exit + +; CHECK: exit: +; CHECK: %x2.lcssa = phi i32 [ %x2, %loop_with_insert_point ] +exit: + ret i32 %x +} diff --git a/test/Transforms/LCSSA/pr28608.ll b/test/Transforms/LCSSA/pr28608.ll new file mode 100644 index 0000000..3ba3fe8 --- /dev/null +++ b/test/Transforms/LCSSA/pr28608.ll @@ -0,0 +1,35 
@@ +; RUN: opt < %s -lcssa -disable-output +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; PR28608 +; Check that we don't crash on this test. + +define void @foo() { +entry: + br label %bb1 + +bb1: + br label %bb2 + +bb2: + %x = phi i32 [ undef, %bb5 ], [ undef, %bb1 ] + br i1 undef, label %bb3, label %bb6 + +bb3: + br i1 undef, label %bb5, label %bb4 + +bb4: + br label %bb6 + +bb5: + br label %bb2 + +bb6: + br label %bb1 + +exit: + %y = add i32 0, %x + ret void +} + diff --git a/test/Transforms/LoopSimplify/pr28272.ll b/test/Transforms/LoopSimplify/pr28272.ll new file mode 100644 index 0000000..49990f9 --- /dev/null +++ b/test/Transforms/LoopSimplify/pr28272.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -lcssa -loop-unroll -S | FileCheck %s +target triple = "x86_64-unknown-linux-gnu" + +; PR28272 +; When LoopSimplify separates nested loops, it might break LCSSA form: values +; from the original loop might be used in the outer loop. This test invokes +; loop-unroll, which calls loop-simplify before itself. If LCSSA is broken +; after loop-simplify, we crash on assertion. 
+ +; CHECK-LABEL: @foo +define void @foo() { +entry: + br label %header + +header: + br label %loop1 + +loop1: + br i1 true, label %loop1, label %bb43 + +bb43: + %a = phi i32 [ undef, %loop1 ], [ 0, %bb45 ], [ %a, %bb54 ] + %b = phi i32 [ 0, %loop1 ], [ 1, %bb54 ], [ %c, %bb45 ] + br i1 true, label %bb114, label %header + +bb114: + %c = add i32 0, 1 + %d = add i32 0, 1 + br i1 true, label %bb45, label %bb54 + +bb45: + %x = add i32 %d, 0 + br label %bb43 + +bb54: + br label %bb43 +} + +; CHECK-LABEL: @foo2 +define void @foo2() { +entry: + br label %outer + +outer.loopexit: + br label %outer + +outer: + br label %loop1 + +loop1: + br i1 true, label %loop1, label %loop2.preheader + +loop2.preheader: + %a.ph = phi i32 [ undef, %loop1 ] + %b.ph = phi i32 [ 0, %loop1 ] + br label %loop2 + +loop2: + %a = phi i32 [ 0, %loop2.if.true ], [ %a, %loop2.if.false ], [ %a.ph, %loop2.preheader ], [0, %bb] + %b = phi i32 [ 1, %loop2.if.false ], [ %c, %loop2.if.true ], [ %b.ph, %loop2.preheader ], [%c, %bb] + br i1 true, label %loop2.if, label %outer.loopexit + +loop2.if: + %c = add i32 0, 1 + switch i32 undef, label %loop2.if.false [i32 0, label %loop2.if.true + i32 1, label %bb] + +loop2.if.true: + br i1 undef, label %loop2, label %bb + +loop2.if.false: + br label %loop2 + +bb: + br label %loop2 +} diff --git a/test/Transforms/LoopStrengthReduce/X86/pr28719.ll b/test/Transforms/LoopStrengthReduce/X86/pr28719.ll new file mode 100644 index 0000000..0e74ff2 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/X86/pr28719.ll @@ -0,0 +1,47 @@ +; RUN: opt < %s -loop-reduce -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global i32 0, align 4 +@b = global i8 0, align 1 +@c = global [4 x i8] zeroinitializer, align 1 + +; Just make sure we don't generate code with uses not dominated by defs. 
+; CHECK-LABEL: @main( +define i32 @main() { +entry: + %a0 = load i32, i32* @a, align 4 + %cmpa = icmp slt i32 %a0, 4 + br i1 %cmpa, label %preheader, label %for.end + +preheader: + %b0 = load i8, i8* @b, align 1 + %b0sext = sext i8 %b0 to i64 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %preheader ], [ %iv.next, %lor.false ] + %mul = mul nsw i64 %b0sext, %iv + %multrunc = trunc i64 %mul to i32 + %cmp = icmp eq i32 %multrunc, 0 + br i1 %cmp, label %lor.false, label %if.then + +lor.false: + %cgep = getelementptr inbounds [4 x i8], [4 x i8]* @c, i64 0, i64 %iv + %ci = load i8, i8* %cgep, align 1 + %cisext = sext i8 %ci to i32 + %ivtrunc = trunc i64 %iv to i32 + %cmp2 = icmp eq i32 %cisext, %ivtrunc + %iv.next = add i64 %iv, 1 + br i1 %cmp2, label %for.body, label %if.then + +if.then: + tail call void @abort() + unreachable + +for.end: + ret i32 0 +} + +declare void @abort() diff --git a/test/Transforms/LoopVectorize/pr28541.ll b/test/Transforms/LoopVectorize/pr28541.ll new file mode 100644 index 0000000..7bb7f09 --- /dev/null +++ b/test/Transforms/LoopVectorize/pr28541.ll @@ -0,0 +1,71 @@ +; RUN: opt -loop-vectorize -pass-remarks=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; FIXME: Check for -pass-remarks-missed and -pass-remarks-analysis output when +; addAcyclicInnerLoop emits analysis. 
+ +; Check that opt does not crash on such input: +; +; a, b, c; +; fn1() { +; while (b--) { +; c = a; +; switch (a & 3) +; case 0: +; do +; case 3: +; case 2: +; case 1: +; ; +; while (--c) +; ; +; } +; } + +@b = common global i32 0, align 4 +@a = common global i32 0, align 4 +@c = common global i32 0, align 4 + +; CHECK-NOT: vectorized loop +; CHECK-LABEL: fn1 + +define i32 @fn1() { +entry: + %tmp2 = load i32, i32* @b, align 4 + %dec3 = add nsw i32 %tmp2, -1 + store i32 %dec3, i32* @b, align 4 + %tobool4 = icmp eq i32 %tmp2, 0 + br i1 %tobool4, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %entry + %tmp1 = load i32, i32* @a, align 4 + %and = and i32 %tmp1, 3 + %switch = icmp eq i32 %and, 0 + br label %while.body + +while.cond: ; preds = %do.cond + %dec = add nsw i32 %dec7, -1 + %tobool = icmp eq i32 %dec7, 0 + br i1 %tobool, label %while.cond.while.end_crit_edge, label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.cond + %dec7 = phi i32 [ %dec3, %while.body.lr.ph ], [ %dec, %while.cond ] + br i1 %switch, label %do.body, label %do.cond + +do.body: ; preds = %do.cond, %while.body + %dec25 = phi i32 [ %dec2, %do.cond ], [ %tmp1, %while.body ] + br label %do.cond + +do.cond: ; preds = %do.body, %while.body + %dec26 = phi i32 [ %dec25, %do.body ], [ %tmp1, %while.body ] + %dec2 = add nsw i32 %dec26, -1 + %tobool3 = icmp eq i32 %dec2, 0 + br i1 %tobool3, label %while.cond, label %do.body + +while.cond.while.end_crit_edge: ; preds = %while.cond + store i32 0, i32* @c, align 4 + store i32 -1, i32* @b, align 4 + br label %while.end + +while.end: ; preds = %while.cond.while.end_crit_edge, %entry + ret i32 undef +} diff --git a/test/Transforms/SafeStack/coloring-ssp.ll b/test/Transforms/SafeStack/coloring-ssp.ll new file mode 100644 index 0000000..d71babe --- /dev/null +++ b/test/Transforms/SafeStack/coloring-ssp.ll @@ -0,0 +1,34 @@ +; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s + +; %x and %y 
share a stack slot between them, but not with the stack guard. +define void @f() safestack sspreq { +; CHECK-LABEL: define void @f +entry: +; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr +; CHECK: getelementptr i8, i8* %[[USP]], i32 -16 + +; CHECK: %[[A:.*]] = getelementptr i8, i8* %[[USP]], i32 -8 +; CHECK: %[[StackGuardSlot:.*]] = bitcast i8* %[[A]] to i8** +; CHECK: store i8* %{{.*}}, i8** %[[StackGuardSlot]] + + %x = alloca i64, align 8 + %y = alloca i64, align 8 + %x0 = bitcast i64* %x to i8* + %y0 = bitcast i64* %y to i8* + + call void @llvm.lifetime.start(i64 -1, i8* %x0) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -16 + call void @capture64(i64* %x) + call void @llvm.lifetime.end(i64 -1, i8* %x0) + + call void @llvm.lifetime.start(i64 -1, i8* %y0) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -16 + call void @capture64(i64* %y) + call void @llvm.lifetime.end(i64 -1, i8* %y0) + + ret void +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @capture64(i64*) diff --git a/test/Transforms/SafeStack/layout-region-split.ll b/test/Transforms/SafeStack/layout-region-split.ll new file mode 100644 index 0000000..ceb18bb --- /dev/null +++ b/test/Transforms/SafeStack/layout-region-split.ll @@ -0,0 +1,84 @@ +; Regression test for safestack layout. Used to fail with asan. 
+; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s + +define void @f() safestack { +; CHECK-LABEL: define void @f +entry: +; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr +; CHECK: getelementptr i8, i8* %[[USP]], i32 -224 + + %x0 = alloca i8, align 16 + %x1 = alloca i8, align 16 + %x2 = alloca i8, align 16 + %x3 = alloca i8, align 16 + %x4 = alloca i8, align 16 + %x5 = alloca i8, align 16 + %x6 = alloca i8, align 16 + %x7 = alloca i8, align 16 + %x8 = alloca i8, align 16 + %x9 = alloca i8, align 16 + %x10 = alloca i8, align 16 + %x11 = alloca i8, align 16 + %x12 = alloca i8, align 16 + %x13 = alloca i8, align 16 + %y0 = alloca i8, align 2 + %y1 = alloca i8, align 2 + %y2 = alloca i8, align 2 + %y3 = alloca i8, align 2 + %y4 = alloca i8, align 2 + %y5 = alloca i8, align 2 + %y6 = alloca i8, align 2 + %y7 = alloca i8, align 2 + %y8 = alloca i8, align 2 + +; CHECK: getelementptr i8, i8* %[[USP]], i32 -16 + call void @capture8(i8* %x0) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -32 + call void @capture8(i8* %x1) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -48 + call void @capture8(i8* %x2) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -64 + call void @capture8(i8* %x3) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -80 + call void @capture8(i8* %x4) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -96 + call void @capture8(i8* %x5) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -112 + call void @capture8(i8* %x6) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -128 + call void @capture8(i8* %x7) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -144 + call void @capture8(i8* %x8) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -160 + call void @capture8(i8* %x9) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -176 + call void @capture8(i8* %x10) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -192 + call void @capture8(i8* %x11) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -208 + call void @capture8(i8* %x12) +; CHECK: 
getelementptr i8, i8* %[[USP]], i32 -224 + call void @capture8(i8* %x13) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -2 + call void @capture8(i8* %y0) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -4 + call void @capture8(i8* %y1) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -6 + call void @capture8(i8* %y2) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -8 + call void @capture8(i8* %y3) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -10 + call void @capture8(i8* %y4) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -12 + call void @capture8(i8* %y5) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -14 + call void @capture8(i8* %y6) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -18 + call void @capture8(i8* %y7) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -20 + call void @capture8(i8* %y8) + + ret void +} + +declare void @capture8(i8*) diff --git a/unittests/ADT/SCCIteratorTest.cpp b/unittests/ADT/SCCIteratorTest.cpp index da8c044..597661f 100644 --- a/unittests/ADT/SCCIteratorTest.cpp +++ b/unittests/ADT/SCCIteratorTest.cpp @@ -230,6 +230,7 @@ public: template struct GraphTraits > { typedef typename Graph::NodeType NodeType; + typedef typename Graph::NodeType *NodeRef; typedef typename Graph::ChildIterator ChildIteratorType; static inline NodeType *getEntryNode(const Graph &G) { return G.AccessNode(0); } diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp index 77a2dba..15b03b3 100644 --- a/unittests/IR/MetadataTest.cpp +++ b/unittests/IR/MetadataTest.cpp @@ -449,6 +449,40 @@ TEST_F(MDNodeTest, DistinctOnUniquingCollision) { EXPECT_FALSE(Wrapped1->isDistinct()); } +TEST_F(MDNodeTest, UniquedOnDeletedOperand) { + // temp !{} + TempMDTuple T = MDTuple::getTemporary(Context, None); + + // !{temp !{}} + Metadata *Ops[] = {T.get()}; + MDTuple *N = MDTuple::get(Context, Ops); + + // !{temp !{}} => !{null} + T.reset(); + ASSERT_TRUE(N->isUniqued()); + Metadata *NullOps[] = {nullptr}; + ASSERT_EQ(N, MDTuple::get(Context, NullOps)); +} + 
+TEST_F(MDNodeTest, DistinctOnDeletedValueOperand) { + // i1* @GV + Type *Ty = Type::getInt1PtrTy(Context); + std::unique_ptr GV( + new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage)); + ConstantAsMetadata *Op = ConstantAsMetadata::get(GV.get()); + + // !{i1* @GV} + Metadata *Ops[] = {Op}; + MDTuple *N = MDTuple::get(Context, Ops); + + // !{i1* @GV} => !{null} + GV.reset(); + ASSERT_TRUE(N->isDistinct()); + ASSERT_EQ(nullptr, N->getOperand(0)); + Metadata *NullOps[] = {nullptr}; + ASSERT_NE(N, MDTuple::get(Context, NullOps)); +} + TEST_F(MDNodeTest, getDistinct) { // !{} MDNode *Empty = MDNode::get(Context, None); @@ -669,7 +703,7 @@ TEST_F(MDNodeTest, replaceWithUniquedResolvingOperand) { EXPECT_TRUE(N->isResolved()); } -TEST_F(MDNodeTest, replaceWithUniquedChangingOperand) { +TEST_F(MDNodeTest, replaceWithUniquedDeletedOperand) { // i1* @GV Type *Ty = Type::getInt1PtrTy(Context); std::unique_ptr GV( @@ -686,8 +720,33 @@ TEST_F(MDNodeTest, replaceWithUniquedChangingOperand) { // !{i1* @GV} => !{null} GV.reset(); - ASSERT_TRUE(N->isUniqued()); + ASSERT_TRUE(N->isDistinct()); + ASSERT_EQ(nullptr, N->getOperand(0)); Metadata *NullOps[] = {nullptr}; + ASSERT_NE(N, MDTuple::get(Context, NullOps)); +} + +TEST_F(MDNodeTest, replaceWithUniquedChangedOperand) { + // i1* @GV + Type *Ty = Type::getInt1PtrTy(Context); + std::unique_ptr GV( + new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage)); + ConstantAsMetadata *Op = ConstantAsMetadata::get(GV.get()); + + // temp !{i1* @GV} + Metadata *Ops[] = {Op}; + MDTuple *N = MDTuple::getTemporary(Context, Ops).release(); + + // temp !{i1* @GV} => !{i1* @GV} + ASSERT_EQ(N, MDNode::replaceWithUniqued(TempMDTuple(N))); + ASSERT_TRUE(N->isUniqued()); + + // !{i1* @GV} => !{i1* @GV2} + std::unique_ptr GV2( + new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage)); + GV->replaceAllUsesWith(GV2.get()); + ASSERT_TRUE(N->isUniqued()); + Metadata *NullOps[] = {ConstantAsMetadata::get(GV2.get())}; ASSERT_EQ(N, 
MDTuple::get(Context, NullOps)); } diff --git a/unittests/Support/IteratorTest.cpp b/unittests/Support/IteratorTest.cpp index 8384832..63dfa2a 100644 --- a/unittests/Support/IteratorTest.cpp +++ b/unittests/Support/IteratorTest.cpp @@ -16,6 +16,24 @@ using namespace llvm; namespace { +template struct Shadow; + +struct WeirdIter : std::iterator, Shadow<1>, + Shadow<2>, Shadow<3>> {}; + +struct AdaptedIter : iterator_adaptor_base {}; + +// Test that iterator_adaptor_base forwards typedefs, if value_type is +// unchanged. +static_assert(std::is_same>::value, + ""); +static_assert( + std::is_same>::value, ""); +static_assert(std::is_same>::value, + ""); +static_assert(std::is_same>::value, + ""); + TEST(PointeeIteratorTest, Basic) { int arr[4] = { 1, 2, 3, 4 }; SmallVector V; @@ -98,4 +116,73 @@ TEST(PointeeIteratorTest, SmartPointer) { EXPECT_EQ(End, I); } +TEST(FilterIteratorTest, Lambda) { + auto IsOdd = [](int N) { return N % 2 == 1; }; + int A[] = {0, 1, 2, 3, 4, 5, 6}; + auto Range = make_filter_range(A, IsOdd); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + +TEST(FilterIteratorTest, CallableObject) { + int Counter = 0; + struct Callable { + int &Counter; + + Callable(int &Counter) : Counter(Counter) {} + + bool operator()(int N) { + Counter++; + return N % 2 == 1; + } + }; + Callable IsOdd(Counter); + int A[] = {0, 1, 2, 3, 4, 5, 6}; + auto Range = make_filter_range(A, IsOdd); + EXPECT_EQ(2, Counter); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_GE(Counter, 7); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + +TEST(FilterIteratorTest, FunctionPointer) { + bool (*IsOdd)(int) = [](int N) { return N % 2 == 1; }; + int A[] = {0, 1, 2, 3, 4, 5, 6}; + auto Range = make_filter_range(A, IsOdd); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + +TEST(FilterIteratorTest, Composition) { + auto IsOdd = [](int N) { return N % 2 == 1; }; + std::unique_ptr A[] 
= {make_unique(0), make_unique(1), + make_unique(2), make_unique(3), + make_unique(4), make_unique(5), + make_unique(6)}; + using PointeeIterator = pointee_iterator *>; + auto Range = make_filter_range( + make_range(PointeeIterator(std::begin(A)), PointeeIterator(std::end(A))), + IsOdd); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + +TEST(FilterIteratorTest, InputIterator) { + struct InputIterator + : iterator_adaptor_base { + using BaseT = + iterator_adaptor_base; + + InputIterator(int *It) : BaseT(It) {} + }; + + auto IsOdd = [](int N) { return N % 2 == 1; }; + int A[] = {0, 1, 2, 3, 4, 5, 6}; + auto Range = make_filter_range( + make_range(InputIterator(std::begin(A)), InputIterator(std::end(A))), + IsOdd); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + } // anonymous namespace diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index 37af976..b9cc38d 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -38,7 +38,6 @@ do_test_suite="yes" do_openmp="yes" do_lldb="no" BuildDir="`pwd`" -use_autoconf="no" ExtraConfigureFlags="" ExportBranch="" @@ -57,7 +56,6 @@ function usage() { echo " -no-compare-files Don't test that phase 2 and 3 files are identical." echo " -use-gzip Use gzip instead of xz." echo " -configure-flags FLAGS Extra flags to pass to the configure step." - echo " -use-autoconf Use autoconf instead of cmake" echo " -svn-path DIR Use the specified DIR instead of a release." 
echo " For example -svn-path trunk or -svn-path branches/release_37" echo " -no-rt Disable check-out & build Compiler-RT" @@ -127,9 +125,6 @@ while [ $# -gt 0 ]; do -use-gzip | --use-gzip ) use_gzip="yes" ;; - -use-autoconf | --use-autoconf ) - use_autoconf="yes" - ;; -no-rt ) do_rt="no" ;; @@ -164,13 +159,11 @@ while [ $# -gt 0 ]; do shift done -if [ "$use_autoconf" = "no" ]; then - if [ "$do_test_suite" = "yes" ]; then - # See llvm.org/PR26146. - echo Skipping test-suite build when using CMake. - echo It will still be exported. - do_test_suite="export-only" - fi +if [ "$do_test_suite" = "yes" ]; then + # See llvm.org/PR26146. + echo Skipping test-suite build when using CMake. + echo It will still be exported. + do_test_suite="export-only" fi # Check required arguments. @@ -337,17 +330,14 @@ function configure_llvmCore() { Release ) BuildType="Release" Assertions="OFF" - ConfigureFlags="--enable-optimized --disable-assertions" ;; Release+Asserts ) BuildType="Release" Assertions="ON" - ConfigureFlags="--enable-optimized --enable-assertions" ;; Debug ) BuildType="Debug" Assertions="ON" - ConfigureFlags="--disable-optimized --enable-assertions" ;; * ) echo "# Invalid flavor '$Flavor'" @@ -362,29 +352,18 @@ function configure_llvmCore() { cd $ObjDir echo "# Configuring llvm $Release-$RC $Flavor" - if [ "$use_autoconf" = "yes" ]; then - echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \ - $BuildDir/llvm.src/configure \ - $ConfigureFlags --disable-timestamps $ExtraConfigureFlags \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log - env CC="$c_compiler" CXX="$cxx_compiler" \ - $BuildDir/llvm.src/configure \ - $ConfigureFlags --disable-timestamps $ExtraConfigureFlags \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log - else - echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \ - cmake -G "Unix Makefiles" \ - -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ - -DLLVM_CONFIGTIME="(timestamp not enabled)" \ - $ExtraConfigureFlags 
$BuildDir/llvm.src \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log - env CC="$c_compiler" CXX="$cxx_compiler" \ - cmake -G "Unix Makefiles" \ - -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ - -DLLVM_CONFIGTIME="(timestamp not enabled)" \ - $ExtraConfigureFlags $BuildDir/llvm.src \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log - fi + echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \ + cmake -G "Unix Makefiles" \ + -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ + -DLLVM_CONFIGTIME="(timestamp not enabled)" \ + $ExtraConfigureFlags $BuildDir/llvm.src \ + 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log + env CC="$c_compiler" CXX="$cxx_compiler" \ + cmake -G "Unix Makefiles" \ + -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ + -DLLVM_CONFIGTIME="(timestamp not enabled)" \ + $ExtraConfigureFlags $BuildDir/llvm.src \ + 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log cd $BuildDir } @@ -420,14 +399,6 @@ function test_llvmCore() { deferred_error $Phase $Flavor "check-all failed" fi - if [ "$use_autoconf" = "yes" ]; then - # In the cmake build, unit tests are run as part of check-all. - if ! ( ${MAKE} -k unittests 2>&1 | \ - tee $LogDir/llvm.unittests-Phase$Phase-$Flavor.log ) ; then - deferred_error $Phase $Flavor "unittests failed" - fi - fi - cd $BuildDir }