From a7fe922b98bb45be7dce7c1cfe668ec27eeddc74 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Aug 17 2016 19:33:52 +0000 Subject: Vendor import of llvm release_39 branch r278877: https://llvm.org/svn/llvm-project/llvm/branches/release_39@278877 --- diff --git a/CMakeLists.txt b/CMakeLists.txt index f102424..0393150 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -293,6 +293,7 @@ endif() option(LLVM_ENABLE_CXX1Y "Compile with C++1y enabled." OFF) option(LLVM_ENABLE_LIBCXX "Use libc++ if available." OFF) option(LLVM_ENABLE_LIBCXXABI "Use libc++abi when using libc++." OFF) +option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF) option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON) option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF) diff --git a/LICENSE.TXT b/LICENSE.TXT index 8b1585d..555c8bb 100644 --- a/LICENSE.TXT +++ b/LICENSE.TXT @@ -61,8 +61,6 @@ licenses, and/or restrictions: Program Directory ------- --------- -Autoconf llvm/autoconf - llvm/projects/ModuleMaker/autoconf Google Test llvm/utils/unittest/googletest OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex} pyyaml tests llvm/test/YAMLParser/{*.data, LICENSE.TXT} diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 22b4408..a0a7995 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -144,6 +144,12 @@ function(add_flag_or_print_warning flag name) endif() endfunction() +if(LLVM_ENABLE_LLD) + check_cxx_compiler_flag("-fuse-ld=lld" CXX_SUPPORTS_LLD) + append_if(CXX_SUPPORTS_LLD "-fuse-ld=lld" + CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) +endif() + if( LLVM_ENABLE_PIC ) if( XCODE ) # Xcode has -mdynamic-no-pic on by default, which overrides -fPIC. 
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst index 6a54343..2f5a27c 100644 --- a/docs/CodeGenerator.rst +++ b/docs/CodeGenerator.rst @@ -436,7 +436,7 @@ For example, consider this simple LLVM example: The X86 instruction selector might produce this machine code for the ``div`` and ``ret``: -.. code-block:: llvm +.. code-block:: text ;; Start of div %EAX = mov %reg1024 ;; Copy X (in reg1024) into EAX @@ -453,7 +453,7 @@ By the end of code generation, the register allocator would coalesce the registers and delete the resultant identity moves producing the following code: -.. code-block:: llvm +.. code-block:: text ;; X is in EAX, Y is in ECX mov %EAX, %EDX @@ -965,7 +965,7 @@ target code. For example, consider the following LLVM fragment: This LLVM code corresponds to a SelectionDAG that looks basically like this: -.. code-block:: llvm +.. code-block:: text (fadd:f32 (fmul:f32 (fadd:f32 W, X), Y), Z) diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst index a0ca1bf..413b6f4 100644 --- a/docs/CommandGuide/FileCheck.rst +++ b/docs/CommandGuide/FileCheck.rst @@ -144,7 +144,7 @@ exists anywhere in the file. The FileCheck -check-prefix option ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The FileCheck :option:`-check-prefix` option allows multiple test +The FileCheck `-check-prefix` option allows multiple test configurations to be driven from one `.ll` file. This is useful in many circumstances, for example, testing different architectural variants with :program:`llc`. Here's a simple example: @@ -303,7 +303,7 @@ be aware that the definition rule can match `after` its use. So, for instance, the code below will pass: -.. code-block:: llvm +.. code-block:: text ; CHECK-DAG: vmov.32 [[REG2:d[0-9]+]][0] ; CHECK-DAG: vmov.32 [[REG2]][1] @@ -312,7 +312,7 @@ So, for instance, the code below will pass: While this other code, will not: -.. code-block:: llvm +.. 
code-block:: text ; CHECK-DAG: vmov.32 [[REG2:d[0-9]+]][0] ; CHECK-DAG: vmov.32 [[REG2]][1] @@ -473,7 +473,7 @@ To match newline characters in regular expressions the character class matches output of the form (from llvm-dwarfdump): -.. code-block:: llvm +.. code-block:: text DW_AT_location [DW_FORM_sec_offset] (0x00000233) DW_AT_name [DW_FORM_strp] ( .debug_str[0x000000c9] = "intd") diff --git a/docs/CommandGuide/llvm-nm.rst b/docs/CommandGuide/llvm-nm.rst index f666e1c..319e6e6 100644 --- a/docs/CommandGuide/llvm-nm.rst +++ b/docs/CommandGuide/llvm-nm.rst @@ -68,11 +68,11 @@ OPTIONS .. option:: -B (default) - Use BSD output format. Alias for :option:`--format=bsd`. + Use BSD output format. Alias for `--format=bsd`. .. option:: -P - Use POSIX.2 output format. Alias for :option:`--format=posix`. + Use POSIX.2 output format. Alias for `--format=posix`. .. option:: --debug-syms, -a diff --git a/docs/CommandGuide/opt.rst b/docs/CommandGuide/opt.rst index 3a050f7..7b9255d 100644 --- a/docs/CommandGuide/opt.rst +++ b/docs/CommandGuide/opt.rst @@ -12,16 +12,16 @@ DESCRIPTION The :program:`opt` command is the modular LLVM optimizer and analyzer. It takes LLVM source files as input, runs the specified optimizations or analyses on it, and then outputs the optimized file or the analysis results. The -function of :program:`opt` depends on whether the :option:`-analyze` option is +function of :program:`opt` depends on whether the `-analyze` option is given. -When :option:`-analyze` is specified, :program:`opt` performs various analyses +When `-analyze` is specified, :program:`opt` performs various analyses of the input source. It will usually print the results on standard output, but in a few cases, it will print output to standard error or generate a file with the analysis output, which is usually done when the output is meant for another program. 
-While :option:`-analyze` is *not* given, :program:`opt` attempts to produce an +While `-analyze` is *not* given, :program:`opt` attempts to produce an optimized output file. The optimizations available via :program:`opt` depend upon what libraries were linked into it as well as any additional libraries that have been loaded with the :option:`-load` option. Use the :option:`-help` @@ -68,19 +68,19 @@ OPTIONS .. option:: -disable-opt - This option is only meaningful when :option:`-std-link-opts` is given. It + This option is only meaningful when `-std-link-opts` is given. It disables most passes. .. option:: -strip-debug This option causes opt to strip debug information from the module before - applying other optimizations. It is essentially the same as :option:`-strip` + applying other optimizations. It is essentially the same as `-strip` but it ensures that stripping of debug information is done first. .. option:: -verify-each This option causes opt to add a verify pass after every pass otherwise - specified on the command line (including :option:`-verify`). This is useful + specified on the command line (including `-verify`). This is useful for cases where it is suspected that a pass is creating an invalid module but it is not clear which pass is doing it. diff --git a/docs/ExceptionHandling.rst b/docs/ExceptionHandling.rst index 41dd4b6..a44fb92 100644 --- a/docs/ExceptionHandling.rst +++ b/docs/ExceptionHandling.rst @@ -406,7 +406,7 @@ outlined. After the handler is outlined, this intrinsic is simply removed. ``llvm.eh.exceptionpointer`` ---------------------------- -.. code-block:: llvm +.. code-block:: text i8 addrspace(N)* @llvm.eh.padparam.pNi8(token %catchpad) @@ -427,7 +427,7 @@ backend. Uses of them are generated by the backend's ``llvm.eh.sjlj.setjmp`` ~~~~~~~~~~~~~~~~~~~~~~~ -.. code-block:: llvm +.. code-block:: text i32 @llvm.eh.sjlj.setjmp(i8* %setjmp_buf) @@ -664,7 +664,7 @@ all of the new IR instructions: return 0; } -.. code-block:: llvm +.. 
code-block:: text define i32 @f() nounwind personality i32 (...)* @__CxxFrameHandler3 { entry: @@ -741,7 +741,7 @@ C++ code: } } -.. code-block:: llvm +.. code-block:: text define void @f() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { entry: diff --git a/docs/Extensions.rst b/docs/Extensions.rst index c8ff07c..f702921 100644 --- a/docs/Extensions.rst +++ b/docs/Extensions.rst @@ -43,7 +43,7 @@ The following additional relocation types are supported: corresponds to the COFF relocation types ``IMAGE_REL_I386_DIR32NB`` (32-bit) or ``IMAGE_REL_AMD64_ADDR32NB`` (64-bit). -.. code-block:: gas +.. code-block:: text .text fun: diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst index 56b4b9f..81605bc 100644 --- a/docs/GarbageCollection.rst +++ b/docs/GarbageCollection.rst @@ -204,7 +204,7 @@ IR features is specified by the selected :ref:`GC strategy description Specifying GC code generation: ``gc "..."`` ------------------------------------------- -.. code-block:: llvm +.. code-block:: text define @name(...) gc "name" { ... } diff --git a/docs/GetElementPtr.rst b/docs/GetElementPtr.rst index c9cfae6..f39f1d9 100644 --- a/docs/GetElementPtr.rst +++ b/docs/GetElementPtr.rst @@ -105,7 +105,7 @@ memory, or a global variable. To make this clear, let's consider a more obtuse example: -.. code-block:: llvm +.. code-block:: text %MyVar = uninitialized global i32 ... @@ -142,7 +142,7 @@ Quick answer: there are no superfluous indices. This question arises most often when the GEP instruction is applied to a global variable which is always a pointer type. For example, consider this: -.. code-block:: llvm +.. code-block:: text %MyStruct = uninitialized global { float*, i32 } ... @@ -178,7 +178,7 @@ The GetElementPtr instruction dereferences nothing. That is, it doesn't access memory in any way. That's what the Load and Store instructions are for. GEP is only involved in the computation of addresses. For example, consider this: -.. 
code-block:: llvm +.. code-block:: text %MyVar = uninitialized global { [40 x i32 ]* } ... @@ -195,7 +195,7 @@ illegal. In order to access the 18th integer in the array, you would need to do the following: -.. code-block:: llvm +.. code-block:: text %idx = getelementptr { [40 x i32]* }, { [40 x i32]* }* %, i64 0, i32 0 %arr = load [40 x i32]** %idx @@ -204,7 +204,7 @@ following: In this case, we have to load the pointer in the structure with a load instruction before we can index into the array. If the example was changed to: -.. code-block:: llvm +.. code-block:: text %MyVar = uninitialized global { [40 x i32 ] } ... diff --git a/docs/HowToUseInstrMappings.rst b/docs/HowToUseInstrMappings.rst index 8a3e7c8..1c586b4 100755 --- a/docs/HowToUseInstrMappings.rst +++ b/docs/HowToUseInstrMappings.rst @@ -30,7 +30,7 @@ instructions with each other. These tables are emitted in the ``XXXInstrInfo.inc`` file along with the functions to query them. Following is the definition of ``InstrMapping`` class definied in Target.td file: -.. code-block:: llvm +.. code-block:: text class InstrMapping { // Used to reduce search space only to the instructions using this @@ -69,7 +69,7 @@ non-predicated form by assigning appropriate values to the ``InstrMapping`` fields. For this relationship, non-predicated instructions are treated as key instruction since they are the one used to query the interface function. -.. code-block:: llvm +.. code-block:: text def getPredOpcode : InstrMapping { // Choose a FilterClass that is used as a base class for all the @@ -116,7 +116,7 @@ to include relevant information in its definition. For example, consider following to be the current definitions of ADD, ADD_pt (true) and ADD_pf (false) instructions: -.. code-block:: llvm +.. 
code-block:: text def ADD : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b), "$dst = add($a, $b)", @@ -137,7 +137,7 @@ In this step, we modify these instructions to include the information required by the relationship model, getPredOpcode, so that they can be related. -.. code-block:: llvm +.. code-block:: text def ADD : PredRel, ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b), "$dst = add($a, $b)", diff --git a/docs/InAlloca.rst b/docs/InAlloca.rst index c7609cd..a75f22d 100644 --- a/docs/InAlloca.rst +++ b/docs/InAlloca.rst @@ -41,7 +41,7 @@ that passes two default-constructed ``Foo`` objects to ``g`` in the g(Foo(), Foo()); } -.. code-block:: llvm +.. code-block:: text %struct.Foo = type { i32, i32 } declare void @Foo_ctor(%struct.Foo* %this) diff --git a/docs/LangRef.rst b/docs/LangRef.rst index f6dda59..ce15c47 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -839,7 +839,7 @@ Note that the Mach-O platform doesn't support COMDATs and ELF only supports Here is an example of a COMDAT group where a function will only be selected if the COMDAT key's section is the largest: -.. code-block:: llvm +.. code-block:: text $foo = comdat largest @foo = global i32 2, comdat($foo) @@ -851,7 +851,7 @@ the COMDAT key's section is the largest: As a syntactic sugar the ``$name`` can be omitted if the name is the same as the global name: -.. code-block:: llvm +.. code-block:: text $foo = comdat any @foo = global i32 2, comdat @@ -875,7 +875,7 @@ if a collision occurs in the symbol table. The combined use of COMDATS and section attributes may yield surprising results. For example: -.. code-block:: llvm +.. code-block:: text $foo = comdat any $bar = comdat any @@ -1205,7 +1205,7 @@ makes the format of the prologue data highly target dependent. A trivial example of valid prologue data for the x86 architecture is ``i8 144``, which encodes the ``nop`` instruction: -.. code-block:: llvm +.. code-block:: text define void @f() prologue i8 144 { ... 
} @@ -1213,7 +1213,7 @@ Generally prologue data can be formed by encoding a relative branch instruction which skips the metadata, as in this example of valid prologue data for the x86_64 architecture, where the first two bytes encode ``jmp .+10``: -.. code-block:: llvm +.. code-block:: text %0 = type <{ i8, i8, i8* }> @@ -2237,7 +2237,7 @@ source file name to the local function name. The syntax for the source file name is simply: -.. code-block:: llvm +.. code-block:: text source_filename = "/path/to/source.c" @@ -2847,7 +2847,7 @@ cleared low bit. However, in the ``%C`` example, the optimizer is allowed to assume that the '``undef``' operand could be the same as ``%Y``, allowing the whole '``select``' to be eliminated. -.. code-block:: llvm +.. code-block:: text %A = xor undef, undef @@ -2899,7 +2899,7 @@ does not execute at all. This allows us to delete the divide and all code after it. Because the undefined operation "can't happen", the optimizer can assume that it occurs in dead code. -.. code-block:: llvm +.. code-block:: text a: store undef -> %X b: store %X -> undef @@ -3884,7 +3884,7 @@ their operand. For example: Metadata nodes that aren't uniqued use the ``distinct`` keyword. For example: -.. code-block:: llvm +.. code-block:: text !0 = distinct !{!"test\00", i32 10} @@ -3949,7 +3949,7 @@ fields are tuples containing the debug info to be emitted along with the compile unit, regardless of code optimizations (some nodes are only emitted if there are references to them from instructions). -.. code-block:: llvm +.. code-block:: text !0 = !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, flags: "-O2", runtimeVersion: 2, @@ -3985,7 +3985,7 @@ DIBasicType ``DIBasicType`` nodes represent primitive types, such as ``int``, ``bool`` and ``float``. ``tag:`` defaults to ``DW_TAG_base_type``. -.. code-block:: llvm +.. 
code-block:: text !0 = !DIBasicType(name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) @@ -3994,7 +3994,7 @@ DIBasicType The ``encoding:`` describes the details of the type. Usually it's one of the following: -.. code-block:: llvm +.. code-block:: text DW_ATE_address = 1 DW_ATE_boolean = 2 @@ -4014,7 +4014,7 @@ refers to a tuple; the first operand is the return type, while the rest are the types of the formal arguments in order. If the first operand is ``null``, that represents a function with no return value (such as ``void foo() {}`` in C++). -.. code-block:: llvm +.. code-block:: text !0 = !BasicType(name: "int", size: 32, align: 32, DW_ATE_signed) !1 = !BasicType(name: "char", size: 8, align: 8, DW_ATE_signed_char) @@ -4028,7 +4028,7 @@ DIDerivedType ``DIDerivedType`` nodes represent types derived from other types, such as qualified types. -.. code-block:: llvm +.. code-block:: text !0 = !DIBasicType(name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) @@ -4037,7 +4037,7 @@ qualified types. The following ``tag:`` values are valid: -.. code-block:: llvm +.. code-block:: text DW_TAG_member = 13 DW_TAG_pointer_type = 15 @@ -4089,7 +4089,7 @@ does not have ``flags: DIFlagFwdDecl`` set. LLVM tools that link modules together will unique such definitions at parse time via the ``identifier:`` field, even if the nodes are ``distinct``. -.. code-block:: llvm +.. code-block:: text !0 = !DIEnumerator(name: "SixKind", value: 7) !1 = !DIEnumerator(name: "SevenKind", value: 7) @@ -4100,7 +4100,7 @@ field, even if the nodes are ``distinct``. The following ``tag:`` values are valid: -.. code-block:: llvm +.. code-block:: text DW_TAG_array_type = 1 DW_TAG_class_type = 2 @@ -4219,7 +4219,7 @@ type with an ODR ``identifier:`` and that does not set ``flags: DIFwdDecl``, then the subprogram declaration is uniqued based only on its ``linkageName:`` and ``scope:``. -.. code-block:: llvm +.. 
code-block:: text define void @_Z3foov() !dbg !0 { ... @@ -4244,7 +4244,7 @@ DILexicalBlock two lexical blocks at same depth. They are valid targets for ``scope:`` fields. -.. code-block:: llvm +.. code-block:: text !0 = distinct !DILexicalBlock(scope: !1, file: !2, line: 7, column: 35) @@ -4290,7 +4290,7 @@ the ``arg:`` field is set to non-zero, then this variable is a subprogram parameter, and it will be included in the ``variables:`` field of its :ref:`DISubprogram`. -.. code-block:: llvm +.. code-block:: text !0 = !DILocalVariable(name: "this", arg: 1, scope: !3, file: !2, line: 7, type: !3, flags: DIFlagArtificial) @@ -4313,7 +4313,7 @@ The current supported vocabulary is limited: - ``DW_OP_bit_piece, 16, 8`` specifies the offset and size (``16`` and ``8`` here, respectively) of the variable piece from the working expression. -.. code-block:: llvm +.. code-block:: text !0 = !DIExpression(DW_OP_deref) !1 = !DIExpression(DW_OP_plus, 3) @@ -4336,7 +4336,7 @@ DIImportedEntity ``DIImportedEntity`` nodes represent entities (such as modules) imported into a compile unit. -.. code-block:: llvm +.. code-block:: text !2 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, line: 7) @@ -4349,7 +4349,7 @@ The ``name:`` field is the macro identifier, followed by macro parameters when defining a function-like macro, and the ``value`` field is the token-string used to expand the macro identifier. -.. code-block:: llvm +.. code-block:: text !2 = !DIMacro(macinfo: DW_MACINFO_define, line: 7, name: "foo(x)", value: "((x) + 1)") @@ -4362,7 +4362,7 @@ DIMacroFile The ``nodes:`` field is a list of ``DIMacro`` and ``DIMacroFile`` nodes that appear in the included source file. -.. code-block:: llvm +.. code-block:: text !2 = !DIMacroFile(macinfo: DW_MACINFO_start_file, line: 7, file: !2, nodes: !3) @@ -5660,7 +5660,7 @@ block. Therefore, it must be the only non-phi instruction in the block. Example: """""""" -.. code-block:: llvm +.. 
code-block:: text dispatch1: %cs1 = catchswitch within none [label %handler0, label %handler1] unwind to caller @@ -5711,7 +5711,7 @@ the ``catchret``'s behavior is undefined. Example: """""""" -.. code-block:: llvm +.. code-block:: text catchret from %catch label %continue @@ -5761,7 +5761,7 @@ It transfers control to ``continue`` or unwinds out of the function. Example: """""""" -.. code-block:: llvm +.. code-block:: text cleanupret from %cleanup unwind to caller cleanupret from %cleanup unwind label %continue @@ -5851,7 +5851,7 @@ unsigned and/or signed overflow, respectively, occurs. Example: """""""" -.. code-block:: llvm +.. code-block:: text = add i32 4, %var ; yields i32:result = 4 + %var @@ -5890,7 +5890,7 @@ optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = fadd float 4.0, %var ; yields float:result = 4.0 + %var @@ -5942,7 +5942,7 @@ unsigned and/or signed overflow, respectively, occurs. Example: """""""" -.. code-block:: llvm +.. code-block:: text = sub i32 4, %var ; yields i32:result = 4 - %var = sub i32 0, %val ; yields i32:result = -%var @@ -5985,7 +5985,7 @@ unsafe floating point optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = fsub float 4.0, %var ; yields float:result = 4.0 - %var = fsub float -0.0, %val ; yields float:result = -%var @@ -6039,7 +6039,7 @@ unsigned and/or signed overflow, respectively, occurs. Example: """""""" -.. code-block:: llvm +.. code-block:: text = mul i32 4, %var ; yields i32:result = 4 * %var @@ -6078,7 +6078,7 @@ unsafe floating point optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = fmul float 4.0, %var ; yields float:result = 4.0 * %var @@ -6122,7 +6122,7 @@ such, "((a udiv exact b) mul b) == a"). Example: """""""" -.. code-block:: llvm +.. code-block:: text = udiv i32 4, %var ; yields i32:result = 4 / %var @@ -6168,7 +6168,7 @@ a :ref:`poison value ` if the result would be rounded. Example: """""""" -.. code-block:: llvm +.. 
code-block:: text = sdiv i32 4, %var ; yields i32:result = 4 / %var @@ -6207,7 +6207,7 @@ unsafe floating point optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = fdiv float 4.0, %var ; yields float:result = 4.0 / %var @@ -6249,7 +6249,7 @@ Taking the remainder of a division by zero leads to undefined behavior. Example: """""""" -.. code-block:: llvm +.. code-block:: text = urem i32 4, %var ; yields i32:result = 4 % %var @@ -6304,7 +6304,7 @@ result of the division and the remainder.) Example: """""""" -.. code-block:: llvm +.. code-block:: text = srem i32 4, %var ; yields i32:result = 4 % %var @@ -6344,7 +6344,7 @@ to enable otherwise unsafe floating point optimizations: Example: """""""" -.. code-block:: llvm +.. code-block:: text = frem float 4.0, %var ; yields float:result = 4.0 % %var @@ -6406,7 +6406,7 @@ nsw/nuw bits in (mul %op1, (shl 1, %op2)). Example: """""""" -.. code-block:: llvm +.. code-block:: text = shl i32 4, %var ; yields i32: 4 << %var = shl i32 4, 2 ; yields i32: 16 @@ -6455,7 +6455,7 @@ non-zero. Example: """""""" -.. code-block:: llvm +.. code-block:: text = lshr i32 4, 1 ; yields i32:result = 2 = lshr i32 4, 2 ; yields i32:result = 1 @@ -6506,7 +6506,7 @@ non-zero. Example: """""""" -.. code-block:: llvm +.. code-block:: text = ashr i32 4, 1 ; yields i32:result = 2 = ashr i32 4, 2 ; yields i32:result = 1 @@ -6558,7 +6558,7 @@ The truth table used for the '``and``' instruction is: Example: """""""" -.. code-block:: llvm +.. code-block:: text = and i32 4, %var ; yields i32:result = 4 & %var = and i32 15, 40 ; yields i32:result = 8 @@ -6657,7 +6657,7 @@ The truth table used for the '``xor``' instruction is: Example: """""""" -.. code-block:: llvm +.. code-block:: text = xor i32 4, %var ; yields i32:result = 4 ^ %var = xor i32 15, 40 ; yields i32:result = 39 @@ -6710,7 +6710,7 @@ exceeds the length of ``val``, the results are undefined. Example: """""""" -.. code-block:: llvm +.. 
code-block:: text = extractelement <4 x i32> %vec, i32 0 ; yields i32 @@ -6752,7 +6752,7 @@ undefined. Example: """""""" -.. code-block:: llvm +.. code-block:: text = insertelement <4 x i32> %vec, i32 1, i32 0 ; yields <4 x i32> @@ -6800,7 +6800,7 @@ only one vector. Example: """""""" -.. code-block:: llvm +.. code-block:: text = shufflevector <4 x i32> %v1, <4 x i32> %v2, <4 x i32> ; yields <4 x i32> @@ -6859,7 +6859,7 @@ the index operands. Example: """""""" -.. code-block:: llvm +.. code-block:: text = extractvalue {i32, float} %agg, 0 ; yields i32 @@ -8126,7 +8126,7 @@ or :ref:`ptrtoint ` instructions first. Example: """""""" -.. code-block:: llvm +.. code-block:: text %X = bitcast i8 255 to i8 ; yields i8 :-1 %Y = bitcast i32* %x to sint* ; yields sint*:%x @@ -8265,7 +8265,7 @@ as the values being compared. Otherwise, the result is an ``i1``. Example: """""""" -.. code-block:: llvm +.. code-block:: text = icmp eq i32 4, 5 ; yields: result=false = icmp ne float* %X, %X ; yields: result=false @@ -8379,7 +8379,7 @@ assumptions to be made about the values of input arguments; namely Example: """""""" -.. code-block:: llvm +.. code-block:: text = fcmp oeq float 4.0, 5.0 ; yields: result=false = fcmp one float 4.0, 5.0 ; yields: result=true @@ -8815,7 +8815,7 @@ that does not carry an appropriate :ref:`"funclet" bundle `. Example: """""""" -.. code-block:: llvm +.. code-block:: text dispatch: %cs = catchswitch within none [label %handler0] unwind to caller @@ -8885,7 +8885,7 @@ that does not carry an appropriate :ref:`"funclet" bundle `. Example: """""""" -.. code-block:: llvm +.. code-block:: text %tok = cleanuppad within %cs [] @@ -12481,19 +12481,19 @@ optimistic assumptions made during compilation. The semantics of ``@llvm.experimental.deoptimize`` -- its body is defined to be equivalent to: -.. code-block:: llvm +.. 
code-block:: text - define void @llvm.experimental.guard(i1 %pred, ) { - %realPred = and i1 %pred, undef - br i1 %realPred, label %continue, label %leave [, !make.implicit !{}] + define void @llvm.experimental.guard(i1 %pred, ) { + %realPred = and i1 %pred, undef + br i1 %realPred, label %continue, label %leave [, !make.implicit !{}] - leave: - call void @llvm.experimental.deoptimize() [ "deopt"() ] - ret void + leave: + call void @llvm.experimental.deoptimize() [ "deopt"() ] + ret void - continue: - ret void - } + continue: + ret void + } with the optional ``[, !make.implicit !{}]`` present if and only if it diff --git a/docs/MIRLangRef.rst b/docs/MIRLangRef.rst index a5f8c8c..f6ee6cc 100644 --- a/docs/MIRLangRef.rst +++ b/docs/MIRLangRef.rst @@ -111,7 +111,6 @@ Here is an example of a YAML document that contains an LLVM module: .. code-block:: llvm - --- | define i32 @inc(i32* %x) { entry: %0 = load i32, i32* %x @@ -119,7 +118,6 @@ Here is an example of a YAML document that contains an LLVM module: store i32 %1, i32* %x ret i32 %1 } - ... .. _YAML block literal string: http://www.yaml.org/spec/1.2/spec.html#id2795688 @@ -129,7 +127,7 @@ Machine Functions The remaining YAML documents contain the machine functions. This is an example of such YAML document: -.. code-block:: llvm +.. code-block:: text --- name: inc @@ -172,7 +170,7 @@ A machine basic block is defined in a single block definition source construct that contains the block's ID. The example below defines two blocks that have an ID of zero and one: -.. code-block:: llvm +.. code-block:: text bb.0: @@ -182,7 +180,7 @@ The example below defines two blocks that have an ID of zero and one: A machine basic block can also have a name. It should be specified after the ID in the block's definition: -.. code-block:: llvm +.. code-block:: text bb.0.entry: ; This block's name is "entry" @@ -196,7 +194,7 @@ Block References The machine basic blocks are identified by their ID numbers. 
Individual blocks are referenced using the following syntax: -.. code-block:: llvm +.. code-block:: text %bb.[.] @@ -213,7 +211,7 @@ Successors The machine basic block's successors have to be specified before any of the instructions: -.. code-block:: llvm +.. code-block:: text bb.0.entry: successors: %bb.1.then, %bb.2.else @@ -227,7 +225,7 @@ The branch weights can be specified in brackets after the successor blocks. The example below defines a block that has two successors with branch weights of 32 and 16: -.. code-block:: llvm +.. code-block:: text bb.0.entry: successors: %bb.1.then(32), %bb.2.else(16) @@ -240,7 +238,7 @@ Live In Registers The machine basic block's live in registers have to be specified before any of the instructions: -.. code-block:: llvm +.. code-block:: text bb.0.entry: liveins: %edi, %esi @@ -255,7 +253,7 @@ Miscellaneous Attributes The attributes ``IsAddressTaken``, ``IsLandingPad`` and ``Alignment`` can be specified in brackets after the block's definition: -.. code-block:: llvm +.. code-block:: text bb.0.entry (address-taken): @@ -278,7 +276,7 @@ The instruction's name is usually specified before the operands. The example below shows an instance of the X86 ``RETQ`` instruction with a single machine operand: -.. code-block:: llvm +.. code-block:: text RETQ %eax @@ -287,7 +285,7 @@ operands, the instruction's name has to be specified after them. The example below shows an instance of the AArch64 ``LDPXpost`` instruction with three defined register operands: -.. code-block:: llvm +.. code-block:: text %sp, %fp, %lr = LDPXpost %sp, 2 @@ -303,7 +301,7 @@ Instruction Flags The flag ``frame-setup`` can be specified before the instruction's name: -.. code-block:: llvm +.. code-block:: text %fp = frame-setup ADDXri %sp, 0, 0 @@ -321,13 +319,13 @@ but they can also be used in a number of other places, like the The physical registers are identified by their name. They use the following syntax: -.. code-block:: llvm +.. 
code-block:: text % The example below shows three X86 physical registers: -.. code-block:: llvm +.. code-block:: text %eax %r15 @@ -336,13 +334,13 @@ The example below shows three X86 physical registers: The virtual registers are identified by their ID number. They use the following syntax: -.. code-block:: llvm +.. code-block:: text % Example: -.. code-block:: llvm +.. code-block:: text %0 @@ -366,7 +364,7 @@ The immediate machine operands are untyped, 64-bit signed integers. The example below shows an instance of the X86 ``MOV32ri`` instruction that has an immediate machine operand ``-42``: -.. code-block:: llvm +.. code-block:: text %eax = MOV32ri -42 @@ -384,14 +382,14 @@ machine operands. The register operands can also have optional and a reference to the tied register operand. The full syntax of a register operand is shown below: -.. code-block:: llvm +.. code-block:: text [] [ : ] [ (tied-def ) ] This example shows an instance of the X86 ``XOR32rr`` instruction that has 5 register operands with different register flags: -.. code-block:: llvm +.. code-block:: text dead %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags, implicit-def %al @@ -446,7 +444,7 @@ the subregister indices. The example below shows an instance of the ``COPY`` pseudo instruction that uses the X86 ``sub_8bit`` subregister index to copy 8 lower bits from the 32-bit virtual register 0 to the 8-bit virtual register 1: -.. code-block:: llvm +.. code-block:: text %1 = COPY %0:sub_8bit @@ -461,7 +459,7 @@ The global value machine operands reference the global values from the The example below shows an instance of the X86 ``MOV64rm`` instruction that has a global value operand named ``G``: -.. code-block:: llvm +.. code-block:: text %rax = MOV64rm %rip, 1, _, @G, _ diff --git a/docs/MarkedUpDisassembly.rst b/docs/MarkedUpDisassembly.rst index cc4dbc8..df8befe 100644 --- a/docs/MarkedUpDisassembly.rst +++ b/docs/MarkedUpDisassembly.rst @@ -70,7 +70,7 @@ clients. 
For example, a possible annotation of an ARM load of a stack-relative location might be annotated as: -.. code-block:: nasm +.. code-block:: text ldr , , ]> diff --git a/docs/MergeFunctions.rst b/docs/MergeFunctions.rst index f808010..b87cea6 100644 --- a/docs/MergeFunctions.rst +++ b/docs/MergeFunctions.rst @@ -394,7 +394,7 @@ and in right function "*FR*". And every part of *left* place is equal to the corresponding part of *right* place, and (!) both parts use *Value* instances, for example: -.. code-block:: llvm +.. code-block:: text instr0 i32 %LV ; left side, function FL instr0 i32 %RV ; right side, function FR @@ -409,13 +409,13 @@ in "*FL*" and "*FR*". Consider small example here: -.. code-block:: llvm +.. code-block:: text define void %f(i32 %pf0, i32 %pf1) { instr0 i32 %pf0 instr1 i32 %pf1 instr2 i32 123 } -.. code-block:: llvm +.. code-block:: text define void %g(i32 %pg0, i32 %pg1) { instr0 i32 %pg0 instr1 i32 %pg0 instr2 i32 123 diff --git a/docs/NVPTXUsage.rst b/docs/NVPTXUsage.rst index 8b8c40f..fdfc8e4 100644 --- a/docs/NVPTXUsage.rst +++ b/docs/NVPTXUsage.rst @@ -37,7 +37,7 @@ code. By default, the back-end will emit device functions. Metadata is used to declare a function as a kernel function. This metadata is attached to the ``nvvm.annotations`` named metadata object, and has the following format: -.. code-block:: llvm +.. code-block:: text !0 = !{, metadata !"kernel", i32 1} diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 54f2d53..dc76617 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -40,7 +40,10 @@ Non-comprehensive list of changes in this release * There is no longer a "global context" available in LLVM, except for the C API. -* .. note about autoconf build having been removed. +* The autoconf build system has been removed in favor of CMake. LLVM 3.9 + requires CMake 3.4.3 or later to build. For information about using CMake + please see the documentation on :doc:`CMake`. 
For information about the CMake + language there is also a :doc:`CMakePrimer` document available. * .. note about C API functions LLVMParseBitcode, LLVMParseBitcodeInContext, LLVMGetBitcodeModuleInContext and @@ -69,11 +72,13 @@ Non-comprehensive list of changes in this release need to be updated to replace the argument node and remove any dead nodes in cases where they currently return an ``SDNode *`` from this interface. -* Introduction of ThinLTO: [FIXME: needs to be documented more extensively in - /docs/ ; ping Mehdi/Teresa before the release if not done] - * Raised the minimum required CMake version to 3.4.3. +* Added the MemorySSA analysis, which hopes to replace MemoryDependenceAnalysis. + It should provide higher-quality results than MemDep, and be algorithmically + faster than MemDep. Currently, GVNHoist (which is off by default) makes use of + MemorySSA. + .. NOTE For small 1-3 sentence descriptions, just add an entry at the end of this list. If your description won't fit comfortably in one bullet @@ -93,6 +98,32 @@ Non-comprehensive list of changes in this release Makes programs 10x faster by doing Special New Thing. +GCC ABI Tag +----------- + +Recently, many of the Linux distributions (ex. `Fedora `_, +`Debian `_, `Ubuntu `_) +have moved on to use the new `GCC ABI `_ +to work around `C++11 incompatibilities in libstdc++ `_. +This caused `incompatibility problems `_ +with other compilers (ex. Clang), which needed to be fixed, but due to the +experimental nature of GCC's own implementation, it took a long time for it to +land in LLVM (`here `_ and +`here `_), not in time for the 3.8 release. + +Those patches are now present in the 3.9.0 release and should be working on the +majority of cases, as they have been tested thoroughly. However, some bugs were +`filed in GCC `_ and have not +yet been fixed, so there may be corner cases not covered by either GCC or Clang.
+Bug fixes to those problems should be reported in Bugzilla (either LLVM or GCC), +and patches to LLVM's trunk are very likely to be back-ported to future 3.9.x +releases (depends on how destructive it is). + +Unfortunately, these patches won't be back-ported to 3.8.x or earlier, so we +strongly recommend people to use 3.9.x when GCC ABI cases are at stake. + +For a more in-depth view of the issue, check our `Bugzilla entry `_. + Changes to the LLVM IR ---------------------- @@ -110,16 +141,98 @@ link-time may be differently optimized than the one what was visible during optimization, and may have arbitrarily different observable behavior. See `PR26774 `_ for more details. -Changes to the ARM Backend +Support for ThinLTO +------------------- + +LLVM now supports ThinLTO compilation, which can be invoked by compiling +and linking with -flto=thin. The gold linker plugin, as well as linkers +that use the new ThinLTO API in libLTO (like ld64), will transparently +execute the ThinLTO backends in parallel threads. +For more information on ThinLTO and the LLVM implementation, see the +`ThinLTO blog post `_. + +Changes to the ARM Targets -------------------------- - During this release ... +**During this release the AArch64 backend has:** + +* Gained support for Qualcomm's Kryo and Broadcom's Vulcan CPUs, including + scheduling models. +* Landed a scheduling model for Samsung's Exynos M1. +* Seen a lot of work on GlobalISel. +* Learned a few more useful combines (fadd and fmul into fmadd, adjustments to the + stack pointer for callee-save stack memory and local stack memory etc). +* Gained support for the Swift calling convention. +* Switched to using SubtargetFeatures rather than testing for specific CPUs and + to using TableGen for handling system instruction operands. +* Like ARM, AArch64 is now using the TargetParser, so no more StringSwitches + matching CPU, FPU or feature names will be accepted in normal code. +* Clang can now self-host itself using LLD on AArch64. 
+* Gained a big batch of tests from Halide. + + Furthermore, LLDB now supports AArch64 compact unwind tables, as used on iOS, + tvOS and watchOS. + +**During this release the ARM target has:** + +* ARMv8.2-A can now be targeted directly via Clang flags. +* Added preliminary support for Cortex-R8. +* LLDB can now parse EABI attributes for an ELF input. +* Initial ARM/Thumb support was added to LLD. +* The ExecutionEngine now supports COFF/ARM. +* Swift calling convention was ported to ARM. +* A large number of codegen fixes around ARMv8, DSP, correct sub-target support, + relocations, EABI, EHABI, Windows on ARM, atomics. +* Improved assembler support for Linux/Android/Chromium sub-projects. +* Initial support for MUSL (libc) on ARM. +* Support for Thumb1 targets in libunwind. +* Gained a big batch of tests from Halide. Changes to the MIPS Target -------------------------- - During this release ... +**During this release the MIPS target has:** + +* Enabled the Integrated Assembler by default for all ``mips-*`` and + ``mipsel-*`` triples. +* Significantly improved the Integrated Assembler support for the n64 ABI. +* Added the Clang frontend ``-mcompact-branches={never,optimal,always}`` option + that controls how LLVM generates compact branches for MIPS targets. +* Improved performance and code size for stack pointer adjustments in functions + with large frames. +* Implemented many instructions from the microMIPS32R6 ISA and added CodeGen + support for most of them. +* Added support for the triple used by Debian Stretch for little endian + MIPS64, i.e. ``mips64el-linux-gnuabi64``. +* Removed EABI which was neither tested nor properly supported. +* Gained the ability to self-host on MIPS32R6. +* Gained the ability to self-host on MIPS64R2 and MIPS64R6 when using the n64 + ABI. +* Added support for the ``LA`` macro in PIC mode for o32. +* Added support for safestack in compiler-rt. +* Added support for the MIPS n64 ABI in LLD.
+* Added LLD support for TLS relocations for both o32 and n64 MIPS ABIs. + +**The MIPS target has also fixed various bugs including the following notable +fixes:** + +* Delay slots are no longer filled multiple times when either ``-save-temps`` + or ``-via-file-asm`` are used. +* Updated n32 and n64 to follow the standard ELF conventions for label prefixes + (``.L``), whereas o32 still uses its own (``$``). +* Properly sign-extend values to GPR width for instructions that expect 32-bit + values on 64-bit ISAs. +* Several fixes for the delay-slot filler pass, including correct + forbidden-slot hazard handling. +* Fixed several errors caught by the machine verifier when turned on for MIPS. +* Fixed broken predicate for ``SELECT`` patterns in MIPS64. +* Fixed wrong truncation of memory address for ``LL``/``SC`` sequences in + MIPS64. +* Fixed the o32, n32 and n64 handling of ``.cprestore`` directives when inside + a ``.set noat`` region by the Integrated Assembler. +* Fixed the ordering of ``HI``/``LO`` pairs in the relocation table. +* Fixed the generated ELF ``EFlags`` when Octeon is the target. Changes to the PowerPC Target @@ -140,9 +253,16 @@ Changes to the X86 Target extensions using ``-march=knl``. The switch enables the ISA extensions AVX-512{F, CD, ER, PF}. +* LLVM will now prefer ``PUSH`` instructions rather than ``%esp``-relative + ``MOV`` instructions for function calls at all optimization levels greater + than ``-O0``. Previously this transformation only occurred at ``-Os``. + Changes to the AMDGPU Target ----------------------------- + * Added backend support for OpenGL shader image, buffer storage, atomic + counter, and compute shader extensions (supported since Mesa 12) + * Mesa 11.0.x is no longer supported @@ -167,6 +287,21 @@ projects that have already been updated to work with LLVM 3.9. * A project +LDC - the LLVM-based D compiler +------------------------------- + +`D `_ is a language with C-like syntax and static typing.
It +pragmatically combines efficiency, control, and modeling power, with safety and +programmer productivity. D supports powerful concepts like Compile-Time Function +Execution (CTFE) and Template Meta-Programming, provides an innovative approach +to concurrency and offers many classical paradigms. + +`LDC `_ uses the frontend from the reference compiler +combined with LLVM as backend to produce efficient native code. LDC targets +x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM +and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64 +are underway. + Additional Information ====================== diff --git a/docs/SegmentedStacks.rst b/docs/SegmentedStacks.rst index c0bf32b..b1c588c 100644 --- a/docs/SegmentedStacks.rst +++ b/docs/SegmentedStacks.rst @@ -33,7 +33,7 @@ current stack limit (minus the amount of space needed to allocate a new block) - this slot's offset is again dictated by ``libgcc``. The generated assembly looks like this on x86-64: -.. code-block:: nasm +.. code-block:: text leaq -8(%rsp), %r10 cmpq %fs:112, %r10 diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst index 1815ee3..8c3142e 100644 --- a/docs/SourceLevelDebugging.rst +++ b/docs/SourceLevelDebugging.rst @@ -230,7 +230,7 @@ following C fragment, for example: Compiled to LLVM, this function would be represented like this: -.. code-block:: llvm +.. code-block:: text ; Function Attrs: nounwind ssp uwtable define void @foo() #0 !dbg !4 { @@ -303,7 +303,7 @@ The first intrinsic ``%llvm.dbg.declare`` encodes debugging information for the variable ``X``. The metadata ``!dbg !14`` attached to the intrinsic provides scope information for the variable ``X``. -.. code-block:: llvm +.. 
code-block:: text !14 = !DILocation(line: 2, column: 9, scope: !4) !4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5, @@ -327,7 +327,7 @@ The third intrinsic ``%llvm.dbg.declare`` encodes debugging information for variable ``Z``. The metadata ``!dbg !19`` attached to the intrinsic provides scope information for the variable ``Z``. -.. code-block:: llvm +.. code-block:: text !18 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5) !19 = !DILocation(line: 5, column: 11, scope: !18) @@ -390,7 +390,7 @@ Given an integer global variable declared as follows: a C/C++ front-end would generate the following descriptors: -.. code-block:: llvm +.. code-block:: text ;; ;; Define the global itself. @@ -456,7 +456,7 @@ Given a function declared as follows: a C/C++ front-end would generate the following descriptors: -.. code-block:: llvm +.. code-block:: text ;; ;; Define the anchor for subprograms. diff --git a/docs/Statepoints.rst b/docs/Statepoints.rst index a78ab3c..29b1be3 100644 --- a/docs/Statepoints.rst +++ b/docs/Statepoints.rst @@ -138,7 +138,7 @@ SSA value ``%obj.relocated`` which represents the potentially changed value of ``%obj`` after the safepoint and update any following uses appropriately. The resulting relocation sequence is: -.. code-block:: llvm +.. code-block:: text define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" { @@ -237,7 +237,7 @@ afterwards. If we extend our previous example to include a pointless derived pointer, we get: -.. code-block:: llvm +.. code-block:: text define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" { @@ -283,7 +283,7 @@ Let's assume a hypothetical GC--somewhat unimaginatively named "hypothetical-gc" --that requires that a TLS variable must be written to before and after a call to unmanaged code. The resulting relocation sequence is: -.. code-block:: llvm +.. 
code-block:: text @flag = thread_local global i32 0, align 4 @@ -662,7 +662,7 @@ distinguish between GC references and non-GC references in IR it is given. As an example, given this code: -.. code-block:: llvm +.. code-block:: text define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" { @@ -672,7 +672,7 @@ As an example, given this code: The pass would produce this IR: -.. code-block:: llvm +.. code-block:: text define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" { @@ -737,7 +737,7 @@ As an example, given input IR of the following: This pass would produce the following IR: -.. code-block:: llvm +.. code-block:: text define void @test() gc "statepoint-example" { %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0) diff --git a/docs/TableGen/LangIntro.rst b/docs/TableGen/LangIntro.rst index a148634..c1391e7 100644 --- a/docs/TableGen/LangIntro.rst +++ b/docs/TableGen/LangIntro.rst @@ -232,7 +232,7 @@ the record ends with a semicolon. Here is a simple TableGen file: -.. code-block:: llvm +.. code-block:: text class C { bit V = 1; } def X : C; @@ -276,7 +276,7 @@ derived class or definition wants to override. Let expressions consist of the value. For example, a new class could be added to the example above, redefining the ``V`` field for all of its subclasses: -.. code-block:: llvm +.. code-block:: text class D : C { let V = 0; } def Z : D; @@ -295,7 +295,7 @@ concrete classes. Parameterized TableGen classes specify a list of variable bindings (which may optionally have defaults) that are bound when used. Here is a simple example: -.. code-block:: llvm +.. code-block:: text class FPFormat val> { bits<3> Value = val; @@ -316,7 +316,7 @@ integer. The more esoteric forms of `TableGen expressions`_ are useful in conjunction with template arguments. As an example: -.. code-block:: llvm +.. 
code-block:: text class ModRefVal val> { bits<2> Value = val; @@ -346,7 +346,7 @@ be used to decouple the interface provided to the user of the class from the actual internal data representation expected by the class. In this case, running ``llvm-tblgen`` on the example prints the following definitions: -.. code-block:: llvm +.. code-block:: text def bork { // Value bit isMod = 1; @@ -379,7 +379,7 @@ commonality exists, then in a separate place indicate what all the ops are. Here is an example TableGen fragment that shows this idea: -.. code-block:: llvm +.. code-block:: text def ops; def GPR; @@ -405,7 +405,7 @@ inherit from multiple multiclasses, instantiating definitions from each multiclass. Using a multiclass this way is exactly equivalent to instantiating the classes multiple times yourself, e.g. by writing: -.. code-block:: llvm +.. code-block:: text def ops; def GPR; @@ -432,7 +432,7 @@ the classes multiple times yourself, e.g. by writing: A ``defm`` can also be used inside a multiclass providing several levels of multiclass instantiations. -.. code-block:: llvm +.. code-block:: text class Instruction opc, string Name> { bits<4> opcode = opc; @@ -473,7 +473,7 @@ multiclass instantiations. the class list must start after the last multiclass, and there must be at least one multiclass before them. -.. code-block:: llvm +.. code-block:: text class XD { bits<4> Prefix = 11; } class XS { bits<4> Prefix = 12; } @@ -516,7 +516,7 @@ specified file in place of the include directive. The filename should be specified as a double quoted string immediately after the '``include``' keyword. Example: -.. code-block:: llvm +.. code-block:: text include "foo.td" @@ -532,7 +532,7 @@ commonality from the records. File-scope "let" expressions take a comma-separated list of bindings to apply, and one or more records to bind the values in. Here are some examples: -.. code-block:: llvm +.. 
code-block:: text let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in def RET : I<0xC3, RawFrm, (outs), (ins), "ret", [(X86retflag 0)]>; @@ -559,7 +559,7 @@ ways to factor out commonality from the records, specially if using several levels of multiclass instantiations. This also avoids the need of using "let" expressions within subsequent records inside a multiclass. -.. code-block:: llvm +.. code-block:: text multiclass basic_r opc> { let Predicates = [HasSSE2] in { @@ -587,7 +587,7 @@ TableGen supports the '``foreach``' block, which textually replicates the loop body, substituting iterator values for iterator references in the body. Example: -.. code-block:: llvm +.. code-block:: text foreach i = [0, 1, 2, 3] in { def R#i : Register<...>; @@ -598,7 +598,7 @@ This will create objects ``R0``, ``R1``, ``R2`` and ``R3``. ``foreach`` blocks may be nested. If there is only one item in the body the braces may be elided: -.. code-block:: llvm +.. code-block:: text foreach i = [0, 1, 2, 3] in def R#i : Register<...>; diff --git a/docs/TableGen/index.rst b/docs/TableGen/index.rst index 9526240..5ba555a 100644 --- a/docs/TableGen/index.rst +++ b/docs/TableGen/index.rst @@ -90,7 +90,7 @@ of the classes, then all of the definitions. This is a good way to see what the various definitions expand to fully. Running this on the ``X86.td`` file prints this (at the time of this writing): -.. code-block:: llvm +.. code-block:: text ... def ADD32rr { // Instruction X86Inst I @@ -155,7 +155,7 @@ by the code generator, and specifying it all manually would be unmaintainable, prone to bugs, and tiring to do in the first place. Because we are using TableGen, all of the information was derived from the following definition: -.. code-block:: llvm +.. code-block:: text let Defs = [EFLAGS], isCommutable = 1, // X = ADD Y,Z --> X = ADD Z,Y @@ -201,7 +201,7 @@ TableGen. **TableGen definitions** are the concrete form of 'records'. 
These generally do not have any undefined values, and are marked with the '``def``' keyword. -.. code-block:: llvm +.. code-block:: text def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP">; @@ -220,7 +220,7 @@ floating point instructions in the X86 backend). TableGen keeps track of all of the classes that are used to build up a definition, so the backend can find all definitions of a particular class, such as "Instruction". -.. code-block:: llvm +.. code-block:: text class ProcNoItin Features> : Processor; @@ -235,7 +235,7 @@ If a multiclass inherits from another multiclass, the definitions in the sub-multiclass become part of the current multiclass, as if they were declared in the current multiclass. -.. code-block:: llvm +.. code-block:: text multiclass ro_signed_pats { diff --git a/docs/WritingAnLLVMBackend.rst b/docs/WritingAnLLVMBackend.rst index 023f6ff..f0f3ab5 100644 --- a/docs/WritingAnLLVMBackend.rst +++ b/docs/WritingAnLLVMBackend.rst @@ -345,7 +345,7 @@ to define an object for each register. The specified string ``n`` becomes the ``Name`` of the register. The basic ``Register`` object does not have any subregisters and does not specify any aliases. -.. code-block:: llvm +.. code-block:: text class Register { string Namespace = ""; @@ -361,7 +361,7 @@ subregisters and does not specify any aliases. For example, in the ``X86RegisterInfo.td`` file, there are register definitions that utilize the ``Register`` class, such as: -.. code-block:: llvm +.. code-block:: text def AL : Register<"AL">, DwarfRegNum<[0, 0, 0]>; @@ -414,7 +414,7 @@ classes. In ``Target.td``, the ``Register`` class is the base for the ``RegisterWithSubRegs`` class that is used to define registers that need to specify subregisters in the ``SubRegs`` list, as shown here: -.. code-block:: llvm +.. code-block:: text class RegisterWithSubRegs subregs> : Register { let SubRegs = subregs; @@ -427,7 +427,7 @@ feature common to these subclasses. 
Note the use of "``let``" expressions to override values that are initially defined in a superclass (such as ``SubRegs`` field in the ``Rd`` class). -.. code-block:: llvm +.. code-block:: text class SparcReg : Register { field bits<5> Num; @@ -452,7 +452,7 @@ field in the ``Rd`` class). In the ``SparcRegisterInfo.td`` file, there are register definitions that utilize these subclasses of ``Register``, such as: -.. code-block:: llvm +.. code-block:: text def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>; def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>; @@ -478,7 +478,7 @@ default allocation order of the registers. A target description file ``XXXRegisterInfo.td`` that uses ``Target.td`` can construct register classes using the following class: -.. code-block:: llvm +.. code-block:: text class RegisterClass regTypes, int alignment, dag regList> { @@ -532,7 +532,7 @@ defines a group of 32 single-precision floating-point registers (``F0`` to ``F31``); ``DFPRegs`` defines a group of 16 double-precision registers (``D0-D15``). -.. code-block:: llvm +.. code-block:: text // F0, F1, F2, ..., F31 def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>; @@ -703,7 +703,7 @@ which describes one instruction. An instruction descriptor defines: The Instruction class (defined in ``Target.td``) is mostly used as a base for more complex instruction classes. -.. code-block:: llvm +.. code-block:: text class Instruction { string Namespace = ""; @@ -760,7 +760,7 @@ specific operation value for ``LD``/Load Word. The third parameter is the output destination, which is a register operand and defined in the ``Register`` target description file (``IntRegs``). -.. code-block:: llvm +.. code-block:: text def LDrr : F3_1 <3, 0b000000, (outs IntRegs:$dst), (ins MEMrr:$addr), "ld [$addr], $dst", @@ -769,7 +769,7 @@ target description file (``IntRegs``). The fourth parameter is the input source, which uses the address operand ``MEMrr`` that is defined earlier in ``SparcInstrInfo.td``: -.. 
code-block:: llvm +.. code-block:: text def MEMrr : Operand { let PrintMethod = "printMemOperand"; @@ -788,7 +788,7 @@ immediate value operands. For example, to perform a Load Integer instruction for a Word from an immediate operand to a register, the following instruction class is defined: -.. code-block:: llvm +.. code-block:: text def LDri : F3_2 <3, 0b000000, (outs IntRegs:$dst), (ins MEMri:$addr), "ld [$addr], $dst", @@ -801,7 +801,7 @@ creation of templates to define several instruction classes at once (using the pattern ``F3_12`` is defined to create 2 instruction classes each time ``F3_12`` is invoked: -.. code-block:: llvm +.. code-block:: text multiclass F3_12 Op3Val, SDNode OpNode> { def rr : F3_1 <2, Op3Val, @@ -818,7 +818,7 @@ So when the ``defm`` directive is used for the ``XOR`` and ``ADD`` instructions, as seen below, it creates four instruction objects: ``XORrr``, ``XORri``, ``ADDrr``, and ``ADDri``. -.. code-block:: llvm +.. code-block:: text defm XOR : F3_12<"xor", 0b000011, xor>; defm ADD : F3_12<"add", 0b000000, add>; @@ -830,7 +830,7 @@ For example, the 10\ :sup:`th` bit represents the "greater than" condition for integers, and the 22\ :sup:`nd` bit represents the "greater than" condition for floats. -.. code-block:: llvm +.. code-block:: text def ICC_NE : ICC_VAL< 9>; // Not Equal def ICC_E : ICC_VAL< 1>; // Equal @@ -855,7 +855,7 @@ order they are defined. Fields are bound when they are assigned a value. For example, the Sparc target defines the ``XNORrr`` instruction as a ``F3_1`` format instruction having three operands. -.. code-block:: llvm +.. code-block:: text def XNORrr : F3_1<2, 0b000111, (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c), @@ -865,7 +865,7 @@ format instruction having three operands. The instruction templates in ``SparcInstrFormats.td`` show the base class for ``F3_1`` is ``InstSP``. -.. code-block:: llvm +.. 
code-block:: text class InstSP pattern> : Instruction { field bits<32> Inst; @@ -880,7 +880,7 @@ The instruction templates in ``SparcInstrFormats.td`` show the base class for ``InstSP`` leaves the ``op`` field unbound. -.. code-block:: llvm +.. code-block:: text class F3 pattern> : InstSP { @@ -897,7 +897,7 @@ The instruction templates in ``SparcInstrFormats.td`` show the base class for fields. ``F3`` format instructions will bind the operands ``rd``, ``op3``, and ``rs1`` fields. -.. code-block:: llvm +.. code-block:: text class F3_1 opVal, bits<6> op3val, dag outs, dag ins, string asmstr, list pattern> : F3 { @@ -925,7 +925,7 @@ TableGen definition will add all of its operands to an enumeration in the llvm::XXX:OpName namespace and also add an entry for it into the OperandMap table, which can be queried using getNamedOperandIdx() -.. code-block:: llvm +.. code-block:: text int DstIndex = SP::getNamedOperandIdx(SP::XNORrr, SP::OpName::dst); // => 0 int BIndex = SP::getNamedOperandIdx(SP::XNORrr, SP::OpName::b); // => 1 @@ -972,7 +972,7 @@ For example, the X86 backend defines ``brtarget`` and ``brtarget8``, both instances of the TableGen ``Operand`` class, which represent branch target operands: -.. code-block:: llvm +.. code-block:: text def brtarget : Operand; def brtarget8 : Operand; @@ -1222,14 +1222,14 @@ definitions in ``XXXInstrInfo.td``. For example, in ``SparcInstrInfo.td``, this entry defines a register store operation, and the last parameter describes a pattern with the store DAG operator. -.. code-block:: llvm +.. code-block:: text def STrr : F3_1< 3, 0b000100, (outs), (ins MEMrr:$addr, IntRegs:$src), "st $src, [$addr]", [(store i32:$src, ADDRrr:$addr)]>; ``ADDRrr`` is a memory mode that is also defined in ``SparcInstrInfo.td``: -.. code-block:: llvm +.. 
code-block:: text def ADDRrr : ComplexPattern; @@ -1240,7 +1240,7 @@ defined in an implementation of the Instructor Selector (such as In ``lib/Target/TargetSelectionDAG.td``, the DAG operator for store is defined below: -.. code-block:: llvm +.. code-block:: text def store : PatFrag<(ops node:$val, node:$ptr), (st node:$val, node:$ptr), [{ @@ -1458,7 +1458,7 @@ if the current argument is of type ``f32`` or ``f64``), then the action is performed. In this case, the ``CCAssignToReg`` action assigns the argument value to the first available register: either ``R0`` or ``R1``. -.. code-block:: llvm +.. code-block:: text CCIfType<[f32,f64], CCAssignToReg<[R0, R1]>> @@ -1469,7 +1469,7 @@ which registers are used for specified scalar return types. A single-precision float is returned to register ``F0``, and a double-precision float goes to register ``D0``. A 32-bit integer is returned in register ``I0`` or ``I1``. -.. code-block:: llvm +.. code-block:: text def RetCC_Sparc32 : CallingConv<[ CCIfType<[i32], CCAssignToReg<[I0, I1]>>, @@ -1484,7 +1484,7 @@ the size of the slot, and the second parameter, also 4, indicates the stack alignment along 4-byte units. (Special cases: if size is zero, then the ABI size is used; if alignment is zero, then the ABI alignment is used.) -.. code-block:: llvm +.. code-block:: text def CC_Sparc32 : CallingConv<[ // All arguments get passed in integer registers if there is space. @@ -1499,7 +1499,7 @@ the following example (in ``X86CallingConv.td``), the definition of assigned to the register ``ST0`` or ``ST1``, the ``RetCC_X86Common`` is invoked. -.. code-block:: llvm +.. code-block:: text def RetCC_X86_32_C : CallingConv<[ CCIfType<[f32], CCAssignToReg<[ST0, ST1]>>, @@ -1514,7 +1514,7 @@ then a specified action is invoked. In the following example (in ``RetCC_X86_32_Fast`` is invoked. If the ``SSECall`` calling convention is in use, then ``RetCC_X86_32_SSE`` is invoked. -.. code-block:: llvm +.. 
code-block:: text def RetCC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::Fast", CCDelegateTo>, @@ -1682,7 +1682,7 @@ feature, the value of the attribute, and a description of the feature. (The fifth parameter is a list of features whose presence is implied, and its default value is an empty array.) -.. code-block:: llvm +.. code-block:: text class SubtargetFeature i = []> { @@ -1696,7 +1696,7 @@ default value is an empty array.) In the ``Sparc.td`` file, the ``SubtargetFeature`` is used to define the following features. -.. code-block:: llvm +.. code-block:: text def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true", "Enable SPARC-V9 instructions">; @@ -1710,7 +1710,7 @@ Elsewhere in ``Sparc.td``, the ``Proc`` class is defined and then is used to define particular SPARC processor subtypes that may have the previously described features. -.. code-block:: llvm +.. code-block:: text class Proc Features> : Processor; diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst index 9e9d9f1..537bbbc 100644 --- a/docs/WritingAnLLVMPass.rst +++ b/docs/WritingAnLLVMPass.rst @@ -747,7 +747,7 @@ template parameter is the name of the pass that is to be used on the command line to specify that the pass should be added to a program (for example, with :program:`opt` or :program:`bugpoint`). The first argument is the name of the pass, which is to be used for the :option:`-help` output of programs, as well -as for debug output generated by the :option:`--debug-pass` option. +as for debug output generated by the `--debug-pass` option. If you want your pass to be easily dumpable, you should implement the virtual print method: diff --git a/docs/index.rst b/docs/index.rst index ef1d4ec..a68dd1b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,11 +1,6 @@ Overview ======== -.. warning:: - - If you are using a released version of LLVM, see `the download page - `_ to find your documentation. 
- The LLVM compiler infrastructure supports a wide range of projects, from industrial strength compilers to specialized JIT applications to small research projects. diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h index 6bdb96a..76f8b31 100644 --- a/include/llvm-c/Core.h +++ b/include/llvm-c/Core.h @@ -2014,6 +2014,9 @@ void LLVMAddFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA); void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, LLVMAttributeRef A); +unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx); +void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs); LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unsigned KindID); @@ -2600,6 +2603,9 @@ void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, LLVMAttributeRef A); +unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C, LLVMAttributeIndex Idx); +void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs); LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, unsigned KindID); diff --git a/include/llvm/ADT/GraphTraits.h b/include/llvm/ADT/GraphTraits.h index 823caef..eb67b7c 100644 --- a/include/llvm/ADT/GraphTraits.h +++ b/include/llvm/ADT/GraphTraits.h @@ -27,19 +27,24 @@ template struct GraphTraits { // Elements to provide: + // NOTICE: We are in a transition from migration interfaces that require + // NodeType *, to NodeRef. NodeRef is required to be cheap to copy, but does + // not have to be a raw pointer. In the transition, user should define + // NodeType, and NodeRef = NodeType *. 
+ // // typedef NodeType - Type of Node in the graph + // typedef NodeRef - NodeType * // typedef ChildIteratorType - Type used to iterate over children in graph - // static NodeType *getEntryNode(const GraphType &) + // static NodeRef getEntryNode(const GraphType &) // Return the entry node of the graph - // static ChildIteratorType child_begin(NodeType *) - // static ChildIteratorType child_end (NodeType *) + // static ChildIteratorType child_begin(NodeRef) + // static ChildIteratorType child_end (NodeRef) // Return iterators that point to the beginning and ending of the child // node list for the specified node. // - // typedef ...iterator nodes_iterator; // static nodes_iterator nodes_begin(GraphType *G) // static nodes_iterator nodes_end (GraphType *G) @@ -57,7 +62,7 @@ struct GraphTraits { // your argument to XXX_begin(...) is unknown or needs to have the proper .h // file #include'd. // - typedef typename GraphType::UnknownGraphTypeError NodeType; + typedef typename GraphType::UnknownGraphTypeError NodeRef; }; diff --git a/include/llvm/ADT/SCCIterator.h b/include/llvm/ADT/SCCIterator.h index bc74416..e89345c 100644 --- a/include/llvm/ADT/SCCIterator.h +++ b/include/llvm/ADT/SCCIterator.h @@ -37,23 +37,22 @@ namespace llvm { /// build up a vector of nodes in a particular SCC. Note that it is a forward /// iterator and thus you cannot backtrack or re-visit nodes. template > -class scc_iterator - : public iterator_facade_base< - scc_iterator, std::forward_iterator_tag, - const std::vector, ptrdiff_t> { - typedef typename GT::NodeType NodeType; +class scc_iterator : public iterator_facade_base< + scc_iterator, std::forward_iterator_tag, + const std::vector, ptrdiff_t> { + typedef typename GT::NodeRef NodeRef; typedef typename GT::ChildIteratorType ChildItTy; - typedef std::vector SccTy; + typedef std::vector SccTy; typedef typename scc_iterator::reference reference; /// Element of VisitStack during DFS. 
struct StackElement { - NodeType *Node; ///< The current node pointer. + NodeRef Node; ///< The current node pointer. ChildItTy NextChild; ///< The next child, modified inplace during DFS. unsigned MinVisited; ///< Minimum uplink value of all children of Node. - StackElement(NodeType *Node, const ChildItTy &Child, unsigned Min) - : Node(Node), NextChild(Child), MinVisited(Min) {} + StackElement(NodeRef Node, const ChildItTy &Child, unsigned Min) + : Node(Node), NextChild(Child), MinVisited(Min) {} bool operator==(const StackElement &Other) const { return Node == Other.Node && @@ -67,10 +66,10 @@ class scc_iterator /// /// nodeVisitNumbers are per-node visit numbers, also used as DFS flags. unsigned visitNum; - DenseMap nodeVisitNumbers; + DenseMap nodeVisitNumbers; /// Stack holding nodes of the SCC. - std::vector SCCNodeStack; + std::vector SCCNodeStack; /// The current SCC, retrieved using operator*(). SccTy CurrentSCC; @@ -80,7 +79,7 @@ class scc_iterator std::vector VisitStack; /// A single "visit" within the non-recursive DFS traversal. - void DFSVisitOne(NodeType *N); + void DFSVisitOne(NodeRef N); /// The stack-based DFS traversal; defined below. void DFSVisitChildren(); @@ -88,7 +87,7 @@ class scc_iterator /// Compute the next SCC using the DFS traversal. void GetNextSCC(); - scc_iterator(NodeType *entryN) : visitNum(0) { + scc_iterator(NodeRef entryN) : visitNum(0) { DFSVisitOne(entryN); GetNextSCC(); } @@ -131,7 +130,7 @@ public: /// This informs the \c scc_iterator that the specified \c Old node /// has been deleted, and \c New is to be used in its place. 
- void ReplaceNode(NodeType *Old, NodeType *New) { + void ReplaceNode(NodeRef Old, NodeRef New) { assert(nodeVisitNumbers.count(Old) && "Old not in scc_iterator?"); nodeVisitNumbers[New] = nodeVisitNumbers[Old]; nodeVisitNumbers.erase(Old); @@ -139,7 +138,7 @@ public: }; template -void scc_iterator::DFSVisitOne(NodeType *N) { +void scc_iterator::DFSVisitOne(NodeRef N) { ++visitNum; nodeVisitNumbers[N] = visitNum; SCCNodeStack.push_back(N); @@ -155,8 +154,8 @@ void scc_iterator::DFSVisitChildren() { assert(!VisitStack.empty()); while (VisitStack.back().NextChild != GT::child_end(VisitStack.back().Node)) { // TOS has at least one more child so continue DFS - NodeType *childN = *VisitStack.back().NextChild++; - typename DenseMap::iterator Visited = + NodeRef childN = *VisitStack.back().NextChild++; + typename DenseMap::iterator Visited = nodeVisitNumbers.find(childN); if (Visited == nodeVisitNumbers.end()) { // this node has never been seen. @@ -176,7 +175,7 @@ template void scc_iterator::GetNextSCC() { DFSVisitChildren(); // Pop the leaf on top of the VisitStack. 
- NodeType *visitingN = VisitStack.back().Node; + NodeRef visitingN = VisitStack.back().Node; unsigned minVisitNum = VisitStack.back().MinVisited; assert(VisitStack.back().NextChild == GT::child_end(visitingN)); VisitStack.pop_back(); @@ -212,7 +211,7 @@ bool scc_iterator::hasLoop() const { assert(!CurrentSCC.empty() && "Dereferencing END SCC iterator!"); if (CurrentSCC.size() > 1) return true; - NodeType *N = CurrentSCC.front(); + NodeRef N = CurrentSCC.front(); for (ChildItTy CI = GT::child_begin(N), CE = GT::child_end(N); CI != CE; ++CI) if (*CI == N) diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index abd39da..00b796f 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -26,10 +26,18 @@ #include #include // for std::pair +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Compiler.h" namespace llvm { +namespace detail { + +template +using IterOfRange = decltype(std::begin(std::declval())); + +} // End detail namespace //===----------------------------------------------------------------------===// // Extra additions to @@ -235,6 +243,90 @@ auto reverse( llvm::make_reverse_iterator(std::begin(C))); } +/// An iterator adaptor that filters the elements of given inner iterators. +/// +/// The predicate parameter should be a callable object that accepts the wrapped +/// iterator's reference type and returns a bool. When incrementing or +/// decrementing the iterator, it will call the predicate on each element and +/// skip any where it returns false. +/// +/// \code +/// int A[] = { 1, 2, 3, 4 }; +/// auto R = make_filter_range(A, [](int N) { return N % 2 == 1; }); +/// // R contains { 1, 3 }. 
+/// \endcode +template +class filter_iterator + : public iterator_adaptor_base< + filter_iterator, WrappedIteratorT, + typename std::common_type< + std::forward_iterator_tag, + typename std::iterator_traits< + WrappedIteratorT>::iterator_category>::type> { + using BaseT = iterator_adaptor_base< + filter_iterator, WrappedIteratorT, + typename std::common_type< + std::forward_iterator_tag, + typename std::iterator_traits::iterator_category>:: + type>; + + struct PayloadType { + WrappedIteratorT End; + PredicateT Pred; + }; + + Optional Payload; + + void findNextValid() { + assert(Payload && "Payload should be engaged when findNextValid is called"); + while (this->I != Payload->End && !Payload->Pred(*this->I)) + BaseT::operator++(); + } + + // Construct the begin iterator. The begin iterator requires to know where end + // is, so that it can properly stop when it hits end. + filter_iterator(WrappedIteratorT Begin, WrappedIteratorT End, PredicateT Pred) + : BaseT(std::move(Begin)), + Payload(PayloadType{std::move(End), std::move(Pred)}) { + findNextValid(); + } + + // Construct the end iterator. It's not incrementable, so Payload doesn't + // have to be engaged. + filter_iterator(WrappedIteratorT End) : BaseT(End) {} + +public: + using BaseT::operator++; + + filter_iterator &operator++() { + BaseT::operator++(); + findNextValid(); + return *this; + } + + template + friend iterator_range, PT>> + make_filter_range(RT &&, PT); +}; + +/// Convenience function that takes a range of elements and a predicate, +/// and return a new filter_iterator range. +/// +/// FIXME: Currently if RangeT && is a rvalue reference to a temporary, the +/// lifetime of that temporary is not kept by the returned range object, and the +/// temporary is going to be dropped on the floor after the make_iterator_range +/// full expression that contains this function call. 
+template +iterator_range, PredicateT>> +make_filter_range(RangeT &&Range, PredicateT Pred) { + using FilterIteratorT = + filter_iterator, PredicateT>; + return make_range(FilterIteratorT(std::begin(std::forward(Range)), + std::end(std::forward(Range)), + std::move(Pred)), + FilterIteratorT(std::end(std::forward(Range)))); +} + //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index 4781304..b98f840 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -174,6 +174,7 @@ public: UnknownEnvironment, GNU, + GNUABI64, GNUEABI, GNUEABIHF, GNUX32, @@ -476,8 +477,9 @@ public: bool isGNUEnvironment() const { EnvironmentType Env = getEnvironment(); - return Env == Triple::GNU || Env == Triple::GNUEABI || - Env == Triple::GNUEABIHF || Env == Triple::GNUX32; + return Env == Triple::GNU || Env == Triple::GNUABI64 || + Env == Triple::GNUEABI || Env == Triple::GNUEABIHF || + Env == Triple::GNUX32; } /// Checks if the environment could be MSVC. diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h index 2898a67..0bd28d5 100644 --- a/include/llvm/ADT/iterator.h +++ b/include/llvm/ADT/iterator.h @@ -155,7 +155,14 @@ template < typename T = typename std::iterator_traits::value_type, typename DifferenceTypeT = typename std::iterator_traits::difference_type, - typename PointerT = T *, typename ReferenceT = T &, + typename PointerT = typename std::conditional< + std::is_same::value_type>::value, + typename std::iterator_traits::pointer, T *>::type, + typename ReferenceT = typename std::conditional< + std::is_same::value_type>::value, + typename std::iterator_traits::reference, T &>::type, // Don't provide these, they are mostly to act as aliases below. 
typename WrappedTraitsT = std::iterator_traits> class iterator_adaptor_base @@ -168,15 +175,7 @@ protected: iterator_adaptor_base() = default; - template - explicit iterator_adaptor_base( - U &&u, - typename std::enable_if< - !std::is_base_of::type>::type, - DerivedT>::value, - int>::type = 0) - : I(std::forward(u)) {} + explicit iterator_adaptor_base(WrappedIteratorT u) : I(std::move(u)) {} const WrappedIteratorT &wrapped() const { return I; } diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h index 4ecacb0..f37e843 100644 --- a/include/llvm/Analysis/CallGraph.h +++ b/include/llvm/Analysis/CallGraph.h @@ -410,6 +410,7 @@ public: // traversals. template <> struct GraphTraits { typedef CallGraphNode NodeType; + typedef CallGraphNode *NodeRef; typedef CallGraphNode::CallRecord CGNPairTy; typedef std::pointer_to_unary_function @@ -431,6 +432,7 @@ template <> struct GraphTraits { template <> struct GraphTraits { typedef const CallGraphNode NodeType; + typedef const CallGraphNode *NodeRef; typedef CallGraphNode::CallRecord CGNPairTy; typedef std::pointer_to_unary_function diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h index 2fa856a..1acf952 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpander.h +++ b/include/llvm/Analysis/ScalarEvolutionExpander.h @@ -196,6 +196,13 @@ namespace llvm { /// block. Value *expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I); + /// \brief Insert code to directly compute the specified SCEV expression + /// into the program. The inserted code is inserted into the SCEVExpander's + /// current insertion point. If a type is specified, the result will be + /// expanded to have that type, with a cast if necessary. + Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr); + + /// \brief Generates a code sequence that evaluates this predicate. /// The inserted instructions will be at position \p Loc. 
/// The result will be of type i1 and will have a value of 0 when the @@ -253,6 +260,15 @@ namespace llvm { void enableLSRMode() { LSRMode = true; } + /// \brief Set the current insertion point. This is useful if multiple calls + /// to expandCodeFor() are going to be made with the same insert point and + /// the insert point may be moved during one of the expansions (e.g. if the + /// insert point is not a block terminator). + void setInsertPoint(Instruction *IP) { + assert(IP); + Builder.SetInsertPoint(IP); + } + /// \brief Clear the current insertion point. This is useful if the /// instruction that had been serving as the insertion point may have been /// deleted. @@ -313,12 +329,6 @@ namespace llvm { Value *expand(const SCEV *S); - /// \brief Insert code to directly compute the specified SCEV expression - /// into the program. The inserted code is inserted into the SCEVExpander's - /// current insertion point. If a type is specified, the result will be - /// expanded to have that type, with a cast if necessary. - Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr); - /// \brief Determine the most "relevant" loop for the given SCEV. 
const Loop *getRelevantLoop(const SCEV *); diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index d5f918e..2923371 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -740,6 +740,7 @@ struct MBB2NumberFunctor : template <> struct GraphTraits { typedef MachineBasicBlock NodeType; + typedef MachineBasicBlock *NodeRef; typedef MachineBasicBlock::succ_iterator ChildIteratorType; static NodeType *getEntryNode(MachineBasicBlock *BB) { return BB; } @@ -753,6 +754,7 @@ template <> struct GraphTraits { template <> struct GraphTraits { typedef const MachineBasicBlock NodeType; + typedef const MachineBasicBlock *NodeRef; typedef MachineBasicBlock::const_succ_iterator ChildIteratorType; static NodeType *getEntryNode(const MachineBasicBlock *BB) { return BB; } @@ -772,6 +774,7 @@ template <> struct GraphTraits { // template <> struct GraphTraits > { typedef MachineBasicBlock NodeType; + typedef MachineBasicBlock *NodeRef; typedef MachineBasicBlock::pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; @@ -786,6 +789,7 @@ template <> struct GraphTraits > { template <> struct GraphTraits > { typedef const MachineBasicBlock NodeType; + typedef const MachineBasicBlock *NodeRef; typedef MachineBasicBlock::const_pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index af1bf0a..5ef0371 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -210,6 +210,7 @@ public: private: friend class AttrBuilder; friend class AttributeSetImpl; + friend class AttributeSetNode; template friend struct DenseMapInfo; /// \brief The attributes that we are managing. 
This can be null to represent diff --git a/include/llvm/IR/CFG.h b/include/llvm/IR/CFG.h index e9bf093..a256b59 100644 --- a/include/llvm/IR/CFG.h +++ b/include/llvm/IR/CFG.h @@ -155,6 +155,7 @@ struct isPodLike> { template <> struct GraphTraits { typedef BasicBlock NodeType; + typedef BasicBlock *NodeRef; typedef succ_iterator ChildIteratorType; static NodeType *getEntryNode(BasicBlock *BB) { return BB; } @@ -168,6 +169,7 @@ template <> struct GraphTraits { template <> struct GraphTraits { typedef const BasicBlock NodeType; + typedef const BasicBlock *NodeRef; typedef succ_const_iterator ChildIteratorType; static NodeType *getEntryNode(const BasicBlock *BB) { return BB; } @@ -187,6 +189,7 @@ template <> struct GraphTraits { // template <> struct GraphTraits > { typedef BasicBlock NodeType; + typedef BasicBlock *NodeRef; typedef pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; } static inline ChildIteratorType child_begin(NodeType *N) { @@ -199,6 +202,7 @@ template <> struct GraphTraits > { template <> struct GraphTraits > { typedef const BasicBlock NodeType; + typedef const BasicBlock *NodeRef; typedef const_pred_iterator ChildIteratorType; static NodeType *getEntryNode(Inverse G) { return G.Graph; diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 74c9715..b965f08 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -479,6 +479,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">, @@ -1512,8 +1514,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>; def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>; + def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>; def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>; + def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>; } // Vector bit test diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index d21d321..4586a17 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -2349,6 +2349,10 @@ public: /// from getBooleanContents(). bool isConstFalseVal(const SDNode *N) const; + /// Return a constant of type VT that contains a true value that respects + /// getBooleanContents() + SDValue getConstTrueVal(SelectionDAG &DAG, EVT VT, const SDLoc &DL) const; + /// Return if \p N is a True value when extended to \p VT. 
bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool Signed) const; diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp index 90bc249..c2039e1 100644 --- a/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -623,6 +623,7 @@ template <> struct GraphTraits { typedef bfi_detail::IrreducibleGraph GraphT; typedef const GraphT::IrrNode NodeType; + typedef const GraphT::IrrNode *NodeRef; typedef GraphT::IrrNode::iterator ChildIteratorType; static const NodeType *getEntryNode(const GraphT &G) { diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 6c471ab..c9adaa7 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -1424,8 +1424,8 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V, /// integer type Ty is used to select how many bits are available for the /// result. Returns null if the conversion cannot be performed, otherwise /// returns the Constant value resulting from the conversion. -Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero, - Type *Ty) { +Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero, + Type *Ty) { // All of these conversion intrinsics form an integer of at most 64bits. 
unsigned ResultWidth = Ty->getIntegerBitWidth(); assert(ResultWidth <= 64 && @@ -1438,7 +1438,8 @@ Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero, APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth, /*isSigned=*/true, mode, &isExact); - if (status != APFloat::opOK && status != APFloat::opInexact) + if (status != APFloat::opOK && + (!roundTowardZero || status != APFloat::opInexact)) return nullptr; return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true); } @@ -1676,17 +1677,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, case Intrinsic::x86_sse2_cvtsd2si: case Intrinsic::x86_sse2_cvtsd2si64: if (ConstantFP *FPOp = - dyn_cast_or_null(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), - /*roundTowardZero=*/false, Ty); + dyn_cast_or_null(Op->getAggregateElement(0U))) + return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), + /*roundTowardZero=*/false, Ty); case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: if (ConstantFP *FPOp = - dyn_cast_or_null(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), - /*roundTowardZero=*/true, Ty); + dyn_cast_or_null(Op->getAggregateElement(0U))) + return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), + /*roundTowardZero=*/true, Ty); } } diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 0cb2c78..aeaf938 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -3400,7 +3400,10 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, return TrueVal; if (const auto *ICI = dyn_cast(CondVal)) { - unsigned BitWidth = Q.DL.getTypeSizeInBits(TrueVal->getType()); + // FIXME: This code is nearly duplicated in InstCombine. Using/refactoring + // decomposeBitTestICmp() might help. 
+ unsigned BitWidth = + Q.DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); ICmpInst::Predicate Pred = ICI->getPredicate(); Value *CmpLHS = ICI->getOperand(0); Value *CmpRHS = ICI->getOperand(1); @@ -4274,7 +4277,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // Gracefully handle edge cases where the instruction is not wired into any // parent block. - if (I->getParent()) + if (I->getParent() && !I->isEHPad() && !isa(I) && + !I->mayHaveSideEffects()) I->eraseFromParent(); } else { Worklist.insert(I); @@ -4302,7 +4306,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // Gracefully handle edge cases where the instruction is not wired into any // parent block. - if (I->getParent()) + if (I->getParent() && !I->isEHPad() && !isa(I) && + !I->mayHaveSideEffects()) I->eraseFromParent(); } return Simplified; diff --git a/lib/Analysis/LoopUnrollAnalyzer.cpp b/lib/Analysis/LoopUnrollAnalyzer.cpp index f59257a..7bdf340 100644 --- a/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -115,13 +115,19 @@ bool UnrolledInstAnalyzer::visitLoad(LoadInst &I) { // We might have a vector load from an array. FIXME: for now we just bail // out in this case, but we should be able to resolve and simplify such // loads. - if(CDS->getElementType() != I.getType()) + if (CDS->getElementType() != I.getType()) return false; - int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; - if (SimplifiedAddrOp->getValue().getActiveBits() >= 64) + unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; + if (SimplifiedAddrOp->getValue().getActiveBits() > 64) return false; - int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize; + int64_t SimplifiedAddrOpV = SimplifiedAddrOp->getSExtValue(); + if (SimplifiedAddrOpV < 0) { + // FIXME: For now we conservatively ignore out of bound accesses, but + // we're allowed to perform the optimization in this case. 
+ return false; + } + uint64_t Index = static_cast(SimplifiedAddrOpV) / ElemSize; if (Index >= CDS->getNumElements()) { // FIXME: For now we conservatively ignore out of bound accesses, but // we're allowed to perform the optimization in this case. diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 77e4ec7..2e45bb8 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1610,8 +1610,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, Instruction *IP) { - assert(IP); - Builder.SetInsertPoint(IP); + setInsertPoint(IP); return expandCodeFor(SH, Ty); } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index b0ba571..ebf80de 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -214,10 +214,7 @@ TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) { } TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) { - // It's possible to ask for the FuncId of a function which doesn't have a - // subprogram: inlining a function with debug info into a function with none. - if (!SP) - return TypeIndex::None(); + assert(SP); // Check if we've already translated this subprogram. auto I = TypeIndices.find({SP, nullptr}); @@ -621,11 +618,12 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, std::string FuncName; auto *SP = GV->getSubprogram(); + assert(SP); setCurrentSubprogram(SP); // If we have a display name, build the fully qualified name by walking the // chain of scopes. 
- if (SP != nullptr && !SP->getDisplayName().empty()) + if (!SP->getDisplayName().empty()) FuncName = getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName()); @@ -864,7 +862,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { void CodeViewDebug::beginFunction(const MachineFunction *MF) { assert(!CurFn && "Can't process two functions at once!"); - if (!Asm || !MMI->hasDebugInfo()) + if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram()) return; DebugHandlerBase::beginFunction(MF); @@ -1939,7 +1937,8 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); // Ignore DBG_VALUE locations and function prologue. - if (!Asm || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup)) + if (!Asm || !CurFn || MI->isDebugValue() || + MI->getFlag(MachineInstr::FrameSetup)) return; DebugLoc DL = MI->getDebugLoc(); if (DL == PrevInstLoc || !DL) diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index fa70576..23e2aa7 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -996,6 +996,24 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { MachineBasicBlock *IBB = &*I; MachineBasicBlock *PredBB = &*std::prev(I); MergePotentials.clear(); + MachineLoop *ML; + + // Bail if merging after placement and IBB is the loop header because + // -- If merging predecessors that belong to the same loop as IBB, the + // common tail of merged predecessors may become the loop top if block + // placement is called again and the predecessors may branch to this common + // tail and require more branches. This can be relaxed if + // MachineBlockPlacement::findBestLoopTop is more flexible. + // --If merging predecessors that do not belong to the same loop as IBB, the + // loop info of IBB's loop and the other loops may be affected. 
Calling the + // block placement again may make big change to the layout and eliminate the + // reason to do tail merging here. + if (AfterBlockPlacement && MLI) { + ML = MLI->getLoopFor(IBB); + if (ML && IBB == ML->getHeader()) + continue; + } + for (MachineBasicBlock *PBB : I->predecessors()) { if (MergePotentials.size() == TailMergeThreshold) break; @@ -1015,16 +1033,12 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (PBB->hasEHPadSuccessor()) continue; - // Bail out if the loop header (IBB) is not the top of the loop chain - // after the block placement. Otherwise, the common tail of IBB's - // predecessors may become the loop top if block placement is called again - // and the predecessors may branch to this common tail. - // FIXME: Relaxed this check if the algorithm of finding loop top is - // changed in MBP. + // After block placement, only consider predecessors that belong to the + // same loop as IBB. The reason is the same as above when skipping loop + // header. 
if (AfterBlockPlacement && MLI) - if (MachineLoop *ML = MLI->getLoopFor(IBB)) - if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB)) - continue; + if (ML != MLI->getLoopFor(PBB)) + continue; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index 19cd59b..4a1b995 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -530,7 +530,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( unsigned Align = std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), - Align, SSC.getLiveRange(StackGuardSlot)); + Align, SSC.getFullLiveRange()); } for (Argument *Arg : ByValArguments) { diff --git a/lib/CodeGen/SafeStackColoring.cpp b/lib/CodeGen/SafeStackColoring.cpp index 709614f..795eb8d 100644 --- a/lib/CodeGen/SafeStackColoring.cpp +++ b/lib/CodeGen/SafeStackColoring.cpp @@ -25,7 +25,9 @@ static cl::opt ClColoring("safe-stack-coloring", cl::Hidden, cl::init(true)); const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) { - return LiveRanges[AllocaNumbering[AI]]; + const auto IT = AllocaNumbering.find(AI); + assert(IT != AllocaNumbering.end()); + return LiveRanges[IT->second]; } bool StackColoring::readMarker(Instruction *I, bool *IsStart) { diff --git a/lib/CodeGen/SafeStackLayout.cpp b/lib/CodeGen/SafeStackLayout.cpp index b8190e0..fb433c1 100644 --- a/lib/CodeGen/SafeStackLayout.cpp +++ b/lib/CodeGen/SafeStackLayout.cpp @@ -100,7 +100,8 @@ void StackLayout::layoutObject(StackObject &Obj) { } // Split starting and ending regions if necessary. 
- for (StackRegion &R : Regions) { + for (unsigned i = 0; i < Regions.size(); ++i) { + StackRegion &R = Regions[i]; if (Start > R.Start && Start < R.End) { StackRegion R0 = R; R.Start = R0.End = Start; diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d888676..5ecc6da 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6198,13 +6198,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } } - // sext(setcc x, y, cc) -> (select (setcc x, y, cc), -1, 0) - unsigned ElementWidth = VT.getScalarType().getSizeInBits(); + // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) + // Here, T can be 1 or -1, depending on the type of the setcc and + // getBooleanContents(). + unsigned SetCCWidth = N0.getValueType().getScalarSizeInBits(); + SDLoc DL(N); - SDValue NegOne = - DAG.getConstant(APInt::getAllOnesValue(ElementWidth), DL, VT); + // To determine the "true" side of the select, we need to know the high bit + // of the value returned by the setcc if it evaluates to true. + // If the type of the setcc is i1, then the true case of the select is just + // sext(i1 1), that is, -1. + // If the type of the setcc is larger (say, i8) then the value of the high + // bit depends on getBooleanContents(). So, ask TLI for a real "true" value + // of the appropriate width. + SDValue ExtTrueVal = + (SetCCWidth == 1) + ? 
DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), + DL, VT) + : TLI.getConstTrueVal(DAG, VT, DL); + if (SDValue SCC = SimplifySelectCC( - DL, N0.getOperand(0), N0.getOperand(1), NegOne, + DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal, DAG.getConstant(0, DL, VT), cast(N0.getOperand(2))->get(), true)) return SCC; @@ -6215,10 +6229,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { SDLoc DL(N); ISD::CondCode CC = cast(N0.getOperand(2))->get(); - SDValue SetCC = DAG.getSetCC(DL, SetCCVT, - N0.getOperand(0), N0.getOperand(1), CC); - return DAG.getSelect(DL, VT, SetCC, - NegOne, DAG.getConstant(0, DL, VT)); + SDValue SetCC = + DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); + return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, + DAG.getConstant(0, DL, VT)); } } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8235522..29d11c7 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6639,19 +6639,26 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { SDNode *FromNode = From.getNode(); SDNode *ToNode = To.getNode(); ArrayRef DVs = GetDbgValues(FromNode); + SmallVector ClonedDVs; for (ArrayRef::iterator I = DVs.begin(), E = DVs.end(); I != E; ++I) { SDDbgValue *Dbg = *I; // Only add Dbgvalues attached to same ResNo. 
if (Dbg->getKind() == SDDbgValue::SDNODE && - Dbg->getResNo() == From.getResNo()) { + Dbg->getSDNode() == From.getNode() && + Dbg->getResNo() == From.getResNo() && !Dbg->isInvalidated()) { + assert(FromNode != ToNode && + "Should not transfer Debug Values intranode"); SDDbgValue *Clone = getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode, To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(), Dbg->getDebugLoc(), Dbg->getOrder()); - AddDbgValue(Clone, ToNode, false); + ClonedDVs.push_back(Clone); + Dbg->setIsInvalidated(); } } + for (SDDbgValue *I : ClonedDVs) + AddDbgValue(I, ToNode, false); } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f2bc88a..806646f 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1234,6 +1234,16 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const { llvm_unreachable("Invalid boolean contents"); } +SDValue TargetLowering::getConstTrueVal(SelectionDAG &DAG, EVT VT, + const SDLoc &DL) const { + unsigned ElementWidth = VT.getScalarSizeInBits(); + APInt TrueInt = + getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent + ? 
APInt(ElementWidth, 1) + : APInt::getAllOnesValue(ElementWidth); + return DAG.getConstant(TrueInt, DL, VT); +} + bool TargetLowering::isConstFalseVal(const SDNode *N) const { if (!N) return false; diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 3d9a518..8feb18b 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -29,7 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" @@ -539,6 +539,16 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { return TRI->regsOverlap(RegA, RegB); } +// Returns true if Reg is equal or aliased to at least one register in Set. +static bool regOverlapsSet(const SmallVectorImpl &Set, unsigned Reg, + const TargetRegisterInfo *TRI) { + for (unsigned R : Set) + if (TRI->regsOverlap(R, Reg)) + return true; + + return false; +} + /// Return true if it's potentially profitable to commute the two-address /// instruction that's being processed. bool @@ -864,9 +874,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, // FIXME: Needs more sophisticated heuristics. 
return false; - SmallSet Uses; - SmallSet Kills; - SmallSet Defs; + SmallVector Uses; + SmallVector Kills; + SmallVector Defs; for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; @@ -874,12 +884,12 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, if (!MOReg) continue; if (MO.isDef()) - Defs.insert(MOReg); + Defs.push_back(MOReg); else { - Uses.insert(MOReg); + Uses.push_back(MOReg); if (MOReg != Reg && (MO.isKill() || (LIS && isPlainlyKilled(MI, MOReg, LIS)))) - Kills.insert(MOReg); + Kills.push_back(MOReg); } } @@ -888,8 +898,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator AfterMI = std::next(Begin); MachineBasicBlock::iterator End = AfterMI; - while (End->isCopy() && Defs.count(End->getOperand(1).getReg())) { - Defs.insert(End->getOperand(0).getReg()); + while (End->isCopy() && + regOverlapsSet(Defs, End->getOperand(1).getReg(), TRI)) { + Defs.push_back(End->getOperand(0).getReg()); ++End; } @@ -915,21 +926,21 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, if (!MOReg) continue; if (MO.isDef()) { - if (Uses.count(MOReg)) + if (regOverlapsSet(Uses, MOReg, TRI)) // Physical register use would be clobbered. return false; - if (!MO.isDead() && Defs.count(MOReg)) + if (!MO.isDead() && regOverlapsSet(Defs, MOReg, TRI)) // May clobber a physical register def. // FIXME: This may be too conservative. It's ok if the instruction // is sunken completely below the use. return false; } else { - if (Defs.count(MOReg)) + if (regOverlapsSet(Defs, MOReg, TRI)) return false; bool isKill = MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS)); - if (MOReg != Reg && - ((isKill && Uses.count(MOReg)) || Kills.count(MOReg))) + if (MOReg != Reg && ((isKill && regOverlapsSet(Uses, MOReg, TRI)) || + regOverlapsSet(Kills, MOReg, TRI))) // Don't want to extend other live ranges and update kills. 
return false; if (MOReg == Reg && !isKill) diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index 267a0da..d58bff5 100644 --- a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -19,8 +19,8 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Optional.h" #include "llvm/IR/Attributes.h" +#include "AttributeSetNode.h" #include "llvm/Support/DataTypes.h" -#include "llvm/Support/TrailingObjects.h" #include #include @@ -142,73 +142,6 @@ public: StringRef getStringValue() const { return Val; } }; -//===----------------------------------------------------------------------===// -/// \class -/// \brief This class represents a group of attributes that apply to one -/// element: function, return type, or parameter. -class AttributeSetNode final - : public FoldingSetNode, - private TrailingObjects { - friend TrailingObjects; - - unsigned NumAttrs; ///< Number of attributes in this node. - /// Bitset with a bit for each available attribute Attribute::AttrKind. - uint64_t AvailableAttrs; - - AttributeSetNode(ArrayRef Attrs) - : NumAttrs(Attrs.size()), AvailableAttrs(0) { - static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, - "Too many attributes for AvailableAttrs"); - // There's memory after the node where we can store the entries in. - std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects()); - - for (Attribute I : *this) { - if (!I.isStringAttribute()) { - AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum(); - } - } - } - - // AttributesSetNode is uniqued, these should not be publicly available. - void operator=(const AttributeSetNode &) = delete; - AttributeSetNode(const AttributeSetNode &) = delete; -public: - void operator delete(void *p) { ::operator delete(p); } - - static AttributeSetNode *get(LLVMContext &C, ArrayRef Attrs); - - /// \brief Return the number of attributes this AttributeSet contains. 
- unsigned getNumAttributes() const { return NumAttrs; } - - bool hasAttribute(Attribute::AttrKind Kind) const { - return AvailableAttrs & ((uint64_t)1) << Kind; - } - bool hasAttribute(StringRef Kind) const; - bool hasAttributes() const { return NumAttrs != 0; } - - Attribute getAttribute(Attribute::AttrKind Kind) const; - Attribute getAttribute(StringRef Kind) const; - - unsigned getAlignment() const; - unsigned getStackAlignment() const; - uint64_t getDereferenceableBytes() const; - uint64_t getDereferenceableOrNullBytes() const; - std::pair> getAllocSizeArgs() const; - std::string getAsString(bool InAttrGrp) const; - - typedef const Attribute *iterator; - iterator begin() const { return getTrailingObjects(); } - iterator end() const { return begin() + NumAttrs; } - - void Profile(FoldingSetNodeID &ID) const { - Profile(ID, makeArrayRef(begin(), end())); - } - static void Profile(FoldingSetNodeID &ID, ArrayRef AttrList) { - for (unsigned I = 0, E = AttrList.size(); I != E; ++I) - AttrList[I].Profile(ID); - } -}; - typedef std::pair IndexAttrPair; //===----------------------------------------------------------------------===// diff --git a/lib/IR/AttributeSetNode.h b/lib/IR/AttributeSetNode.h new file mode 100644 index 0000000..fab1ed5 --- /dev/null +++ b/lib/IR/AttributeSetNode.h @@ -0,0 +1,98 @@ +//===-- AttributeSetNode.h - AttributeSet Internal Node ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the node class used internally by AttributeSet. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_ATTRIBUTESETNODE_H +#define LLVM_IR_ATTRIBUTESETNODE_H + +#include "llvm/ADT/FoldingSet.h" +#include "llvm/IR/Attributes.h" +#include "llvm/Support/TrailingObjects.h" +#include + +namespace llvm { + +//===----------------------------------------------------------------------===// +/// \class +/// \brief This class represents a group of attributes that apply to one +/// element: function, return type, or parameter. +class AttributeSetNode final + : public FoldingSetNode, + private TrailingObjects { + friend TrailingObjects; + + unsigned NumAttrs; ///< Number of attributes in this node. + /// Bitset with a bit for each available attribute Attribute::AttrKind. + uint64_t AvailableAttrs; + + AttributeSetNode(ArrayRef Attrs) + : NumAttrs(Attrs.size()), AvailableAttrs(0) { + static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, + "Too many attributes for AvailableAttrs"); + // There's memory after the node where we can store the entries in. + std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects()); + + for (Attribute I : *this) { + if (!I.isStringAttribute()) { + AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum(); + } + } + } + + // AttributesSetNode is uniqued, these should not be publicly available. + void operator=(const AttributeSetNode &) = delete; + AttributeSetNode(const AttributeSetNode &) = delete; +public: + void operator delete(void *p) { ::operator delete(p); } + + static AttributeSetNode *get(LLVMContext &C, ArrayRef Attrs); + + static AttributeSetNode *get(AttributeSet AS, unsigned Index) { + return AS.getAttributes(Index); + } + + /// \brief Return the number of attributes this AttributeSet contains. 
+ unsigned getNumAttributes() const { return NumAttrs; } + + bool hasAttribute(Attribute::AttrKind Kind) const { + return AvailableAttrs & ((uint64_t)1) << Kind; + } + bool hasAttribute(StringRef Kind) const; + bool hasAttributes() const { return NumAttrs != 0; } + + Attribute getAttribute(Attribute::AttrKind Kind) const; + Attribute getAttribute(StringRef Kind) const; + + unsigned getAlignment() const; + unsigned getStackAlignment() const; + uint64_t getDereferenceableBytes() const; + uint64_t getDereferenceableOrNullBytes() const; + std::pair> getAllocSizeArgs() const; + std::string getAsString(bool InAttrGrp) const; + + typedef const Attribute *iterator; + iterator begin() const { return getTrailingObjects(); } + iterator end() const { return begin() + NumAttrs; } + + void Profile(FoldingSetNodeID &ID) const { + Profile(ID, makeArrayRef(begin(), end())); + } + static void Profile(FoldingSetNodeID &ID, ArrayRef AttrList) { + for (unsigned I = 0, E = AttrList.size(); I != E; ++I) + AttrList[I].Profile(ID); + } +}; + +} // end llvm namespace + +#endif diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 431e51b..2e4a2f8 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -251,8 +251,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "sse2.cvtps2pd" || Name == "avx.cvtdq2.pd.256" || Name == "avx.cvt.ps2.pd.256" || - Name == "sse2.cvttps2dq" || - Name.startswith("avx.cvtt.") || Name.startswith("avx.vinsertf128.") || Name == "avx2.vinserti128" || Name.startswith("avx.vextractf128.") || @@ -712,12 +710,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd"); else Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd"); - } else if (IsX86 && (Name == "sse2.cvttps2dq" || - Name.startswith("avx.cvtt."))) { - // Truncation (round to zero) float/double to i32 vector conversion. 
- Value *Src = CI->getArgOperand(0); - VectorType *DstTy = cast(CI->getType()); - Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt"); } else if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); SmallVector Elts; diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index a553614..3c4b0cf 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/Attributes.h" +#include "AttributeSetNode.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -1844,6 +1845,18 @@ void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unwrap(F)->addAttribute(Idx, unwrap(A)); } +unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) { + auto *ASN = AttributeSetNode::get(unwrap(F)->getAttributes(), Idx); + return ASN->getNumAttributes(); +} + +void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs) { + auto *ASN = AttributeSetNode::get(unwrap(F)->getAttributes(), Idx); + for (auto A: make_range(ASN->begin(), ASN->end())) + *Attrs++ = wrap(A); +} + LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unsigned KindID) { @@ -2216,6 +2229,21 @@ void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, CallSite(unwrap(C)).addAttribute(Idx, unwrap(A)); } +unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C, + LLVMAttributeIndex Idx) { + auto CS = CallSite(unwrap(C)); + auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx); + return ASN->getNumAttributes(); +} + +void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs) { + auto CS = CallSite(unwrap(C)); + auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx); + for (auto A: make_range(ASN->begin(), ASN->end())) + *Attrs++ = wrap(A); +} + LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C, 
LLVMAttributeIndex Idx, unsigned KindID) { diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp index 5201c2e..f35c64b 100644 --- a/lib/IR/Metadata.cpp +++ b/lib/IR/Metadata.cpp @@ -675,8 +675,8 @@ void MDNode::handleChangedOperand(void *Ref, Metadata *New) { Metadata *Old = getOperand(Op); setOperand(Op, New); - // Drop uniquing for self-reference cycles. - if (New == this) { + // Drop uniquing for self-reference cycles and deleted constants. + if (New == this || (!New && Old && isa(Old))) { if (!isResolved()) resolve(); storeDistinctInContext(); diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index cfa12a9..2bac2a3 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -201,6 +201,7 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) { switch (Kind) { case UnknownEnvironment: return "unknown"; case GNU: return "gnu"; + case GNUABI64: return "gnuabi64"; case GNUEABIHF: return "gnueabihf"; case GNUEABI: return "gnueabi"; case GNUX32: return "gnux32"; @@ -468,6 +469,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { return StringSwitch(EnvironmentName) .StartsWith("eabihf", Triple::EABIHF) .StartsWith("eabi", Triple::EABI) + .StartsWith("gnuabi64", Triple::GNUABI64) .StartsWith("gnueabihf", Triple::GNUEABIHF) .StartsWith("gnueabi", Triple::GNUEABI) .StartsWith("gnux32", Triple::GNUX32) diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index b1e8816..b97a0f1 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -250,6 +250,7 @@ def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", FeatureMacroOpFusion, FeatureNEON, FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, HasV8_1aOps]>; def : ProcessorModel<"generic", NoSchedModel, [ diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index d6f2a19..ac7de1b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7685,6 +7685,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); @@ -7728,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: - ResTy = MVT::v4i32; + ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } + if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) + return SDValue(); + + assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && + "Illegal vector type after legalization"); + SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs @@ -9853,7 +9860,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - return performFpToIntCombine(N, DAG, Subtarget); + return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, Subtarget); case ISD::OR: diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 7e59710..d4784b5 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -20,6 +20,7 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class GCNTargetMachine; struct MachineSchedContext; class MCAsmInfo; class raw_ostream; @@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); -FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine 
*TM = nullptr); +FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index cfe6346..c9c95c7 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -783,15 +783,19 @@ void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) { emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion, RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2); if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, - RuntimeMD::OpenCL_C, 1); - auto Node = MD->getOperand(0); - unsigned short Major = mdconst::extract(Node->getOperand(0)) - ->getZExtValue(); - unsigned short Minor = mdconst::extract(Node->getOperand(1)) - ->getZExtValue(); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, - Major * 100 + Minor * 10, 2); + if (MD->getNumOperands()) { + auto Node = MD->getOperand(0); + if (Node->getNumOperands() > 1) { + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, + RuntimeMD::OpenCL_C, 1); + uint16_t Major = mdconst::extract(Node->getOperand(0)) + ->getZExtValue(); + uint16_t Minor = mdconst::extract(Node->getOperand(1)) + ->getZExtValue(); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, + Major * 100 + Minor * 10, 2); + } + } } } diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 3b41577..b955e23 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include 
"llvm/CodeGen/Passes.h" @@ -30,15 +32,28 @@ using namespace llvm; namespace { class AMDGPUCodeGenPrepare : public FunctionPass, - public InstVisitor { + public InstVisitor { + const GCNTargetMachine *TM; + const SISubtarget *ST; DivergenceAnalysis *DA; - const TargetMachine *TM; + Module *Mod; + bool HasUnsafeFPMath; public: static char ID; AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID), - TM(TM) { } + TM(static_cast(TM)), + ST(nullptr), + DA(nullptr), + Mod(nullptr), + HasUnsafeFPMath(false) { } + + bool visitFDiv(BinaryOperator &I); + + bool visitInstruction(Instruction &I) { + return false; + } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; @@ -55,7 +70,92 @@ public: } // End anonymous namespace +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { + const ConstantFP *CNum = dyn_cast(Num); + if (!CNum) + return false; + + // Reciprocal f32 is handled separately without denormals. + return UnsafeDiv || CNum->isExactlyValue(+1.0); +} + +// Insert an intrinsic for fast fdiv for safe math situations where we can +// reduce precision. Leave fdiv for situations where the generic node is +// expected to be optimized. 
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { + Type *Ty = FDiv.getType(); + + // TODO: Handle half + if (!Ty->getScalarType()->isFloatTy()) + return false; + + MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); + if (!FPMath) + return false; + + const FPMathOperator *FPOp = cast(&FDiv); + float ULP = FPOp->getFPAccuracy(); + if (ULP < 2.5f) + return false; + + FastMathFlags FMF = FPOp->getFastMathFlags(); + bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || + FMF.allowReciprocal(); + if (ST->hasFP32Denormals() && !UnsafeDiv) + return false; + + IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); + Builder.setFastMathFlags(FMF); + Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); + + const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); + Function *Decl + = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); + + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + Value *NewFDiv = nullptr; + + if (VectorType *VT = dyn_cast(Ty)) { + NewFDiv = UndefValue::get(VT); + + // FIXME: Doesn't do the right thing for cases where the vector is partially + // constant. This works when the scalarizer pass is run first. 
+ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { + Value *NumEltI = Builder.CreateExtractElement(Num, I); + Value *DenEltI = Builder.CreateExtractElement(Den, I); + Value *NewElt; + + if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { + NewElt = Builder.CreateFDiv(NumEltI, DenEltI); + } else { + NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); + } + + NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); + } + } else { + if (!shouldKeepFDivF32(Num, UnsafeDiv)) + NewFDiv = Builder.CreateCall(Decl, { Num, Den }); + } + + if (NewFDiv) { + FDiv.replaceAllUsesWith(NewFDiv); + NewFDiv->takeName(&FDiv); + FDiv.eraseFromParent(); + } + + return true; +} + +static bool hasUnsafeFPMath(const Function &F) { + Attribute Attr = F.getFnAttribute("unsafe-fp-math"); + return Attr.getValueAsString() == "true"; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { + Mod = &M; return false; } @@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (!TM || skipFunction(F)) return false; + ST = &TM->getSubtarget(F); DA = &getAnalysis(); - visit(F); + HasUnsafeFPMath = hasUnsafeFPMath(F); - return true; + bool MadeChange = false; + + for (BasicBlock &BB : F) { + BasicBlock::iterator Next; + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { + Next = std::next(I); + MadeChange |= visit(*I); + } + } + + return MadeChange; } INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, @@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, char AMDGPUCodeGenPrepare::ID = 0; -FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) { +FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) { return new AMDGPUCodeGenPrepare(TM); } diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 6761b4b..3944fdb 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -420,9 
+420,10 @@ int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding -int FP32_NEG_ONE = 0xbf800000; int FP32_ONE = 0x3f800000; +int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; +int FP64_NEG_ONE = 0xbff0000000000000; } def CONST : Constants; diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 791872a..8e3471b 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = { #undef GET_INTRINSIC_NAME_TABLE }; -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - if (IntrID < Intrinsic::num_intrinsics) { - return nullptr; - } +namespace { +#define GET_INTRINSIC_ATTRIBUTES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ATTRIBUTES +} + +StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, + ArrayRef Tys) const { + if (IntrID < Intrinsic::num_intrinsics) + return StringRef(); + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); - std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]); - return Result; + return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; +} + +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned NumTys) const { + return getName(IntrID, makeArrayRef(Tys, NumTys)).str(); +} + +FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, + ArrayRef Tys) const { + // FIXME: Re-use Intrinsic::getType machinery + switch (ID) { + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + Type *F32Ty = Type::getFloatTy(Context); + return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false); + } + default: + llvm_unreachable("unhandled intrinsic"); + } } unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, @@ -69,7 +92,19 @@ bool 
AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { } Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + ArrayRef Tys) const { + FunctionType *FTy = getType(M->getContext(), IntrID, Tys); + Function *F + = cast(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); + + AttributeSet AS = getAttributes(M->getContext(), + static_cast(IntrID)); + F->setAttributes(AS); + return F; +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, Type **Tys, - unsigned numTys) const { - llvm_unreachable("Not implemented"); + unsigned NumTys) const { + return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys)); } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index f417392..6cb8b96 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -34,13 +34,23 @@ enum ID { class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(); + + StringRef getName(unsigned IntrId, ArrayRef Tys = None) const; + std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; bool isOverloaded(unsigned IID) const override; Function *getDeclaration(Module *M, unsigned ID, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + + Function *getDeclaration(Module *M, unsigned ID, + ArrayRef = None) const; + + FunctionType *getType(LLVMContext &Context, unsigned ID, + ArrayRef Tys = None) const; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 7754638..0bad63f 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -348,9 +348,6 @@ static VectorType *arrayTypeToVecType(Type *ArrayTy) { static Value * 
calculateVectorIndex(Value *Ptr, const std::map &GEPIdx) { - if (isa(Ptr)) - return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); - GetElementPtrInst *GEP = cast(Ptr); auto I = GEPIdx.find(GEP); @@ -360,11 +357,11 @@ calculateVectorIndex(Value *Ptr, static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // FIXME we only support simple cases if (GEP->getNumOperands() != 3) - return NULL; + return nullptr; ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); if (!I0 || !I0->isZero()) - return NULL; + return nullptr; return GEP->getOperand(2); } @@ -398,7 +395,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { // are just being conservative for now. if (!AllocaTy || AllocaTy->getElementType()->isVectorTy() || - AllocaTy->getNumElements() > 4) { + AllocaTy->getNumElements() > 4 || + AllocaTy->getNumElements() < 2) { DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } @@ -443,9 +441,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *Ptr = Inst->getOperand(0); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + + Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); Inst->replaceAllUsesWith(ExtractElement); @@ -453,9 +453,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; } case Instruction::Store: { + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Value *Ptr = Inst->getOperand(1); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = 
Builder.CreateLoad(BitCast); Value *NewVecValue = Builder.CreateInsertElement(VecValue, Inst->getOperand(0), @@ -469,7 +471,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; default: - Inst->dump(); llvm_unreachable("Inconsistency in instructions promotable to vector"); } } @@ -477,11 +478,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { } static bool isCallPromotable(CallInst *CI) { - // TODO: We might be able to handle some cases where the callee is a - // constantexpr bitcast of a function. - if (!CI->getCalledFunction()) - return false; - IntrinsicInst *II = dyn_cast(CI); if (!II) return false; @@ -773,28 +769,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { continue; } - IntrinsicInst *Intr = dyn_cast(Call); - if (!Intr) { - // FIXME: What is this for? It doesn't make sense to promote arbitrary - // function calls. If the call is to a defined function that can also be - // promoted, we should be able to do this once that function is also - // rewritten. 
- - std::vector ArgTypes; - for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); - ArgIdx != ArgEnd; ++ArgIdx) { - ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); - } - Function *F = Call->getCalledFunction(); - FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, - F->isVarArg()); - Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), - NewType, F->getAttributes()); - Function *NewF = cast(C); - Call->setCalledFunction(NewF); - continue; - } - + IntrinsicInst *Intr = cast(Call); Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3e53f52..b2d4e11 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -309,6 +309,7 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; + void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; bool addInstSelector() override; @@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&DeadMachineInstructionElimID); } +void GCNPassConfig::addIRPasses() { + // TODO: May want to move later or split into an early and late one. 
+ addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); + + AMDGPUPassConfig::addIRPasses(); +} + bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 8f78edd..8ccd176 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -122,6 +122,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); @@ -832,13 +833,18 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::FP_TO_UINT: if (N->getValueType(0) == MVT::i1) { - Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG)); return; } // Fall-through. Since we don't care about out of bounds values // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint // considers some extra cases which are not necessary here. 
case ISD::FP_TO_SINT: { + if (N->getValueType(0) == MVT::i1) { + Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG)); + return; + } + SDValue Result; if (expandFP_TO_SINT(N, Result, DAG)) Results.push_back(Result); @@ -1052,15 +1058,24 @@ SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); } -SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { +SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode( + ISD::SETCC, + DL, + MVT::i1, + Op, DAG.getConstantFP(1.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETEQ)); +} + +SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode( ISD::SETCC, DL, MVT::i1, - Op, DAG.getConstantFP(0.0f, DL, MVT::f32), - DAG.getCondCode(ISD::SETNE) - ); + Op, DAG.getConstantFP(-1.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETEQ)); } SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 2fb6ee2..9700ce1 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -72,7 +72,8 @@ private: SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 54efdc0..f4b04e3 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -41,7 +41,8 @@ enum { WQM = 1 << 22, VGPRSpill 
= 1 << 23, VOPAsmPrefer32Bit = 1 << 24, - Gather4 = 1 << 25 + Gather4 = 1 << 25, + DisableWQM = 1 << 26 }; } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 51241cf..80d4435 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1134,9 +1134,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineFunction *MF = BB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); DebugLoc DL = MI.getDebugLoc(); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32)) - .addOperand(MI.getOperand(0)) - .addImm(MFI->LDSSize); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) + .addOperand(MI.getOperand(0)) + .addImm(MFI->LDSSize); MI.eraseFromParent(); return BB; } @@ -1792,6 +1792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + return lowerFDIV_FAST(Op, DAG); + } case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), @@ -2098,7 +2101,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Catch division cases where we can use shortcuts with rcp and rsq // instructions. -SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { +SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, + SelectionDAG &DAG) const { SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -2139,47 +2143,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) - return FastLowered; - +// Faster 2.5 ULP division that does not support denormals. 
+SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); - // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag - if (EnableAMDGPUFastFDIV) { - // This does not support denormals. - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - // TODO: Should this propagate fast-math-flags? + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - // rcp does not support denormals. 
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); - } +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); - // Generates more precise fpdiv32. const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); @@ -2209,7 +2214,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); + return lowerFastUnsafeFDIV(Op, DAG); SDLoc SL(Op); SDValue X = Op.getOperand(0); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 8e055ee..1d349fa 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 2f63d4e..6163f05 100644 --- 
a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,8 @@ class InstSI DS = 0; field bits<1> MIMG = 0; field bits<1> FLAT = 0; + + // Whether WQM _must_ be enabled for this instruction. field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; @@ -50,6 +52,9 @@ class InstSI Gather4 = 0; + // Whether WQM _must_ be disabled for this instruction. + field bits<1> DisableWQM = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -81,6 +86,7 @@ class InstSI getDebugLoc(); - TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, + *MF); if (TIDReg == AMDGPU::NoRegister) return TIDReg; diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 227b817..fef8904 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -340,6 +340,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isDisableWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM; + } + + bool isDisableWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DisableWQM; + } + static bool isVGPRSpill(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 253cc32..00f53e8 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -2949,6 +2949,10 @@ multiclass MUBUF_m , MUBUFAddr64Table <0>; + let DisableWQM = 1 in { + def "_exact" : MUBUF_Pseudo ; + } + let addr64 = 0, isCodeGenOnly = 0 in { def _si : MUBUF_Real_si ; } @@ -3019,7 +3023,8 @@ multiclass MUBUFAtomicOther_m { - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in { + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1, + DisableWQM = 1 in { // 
No return variants let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in { @@ -3423,6 +3428,7 @@ class MIMG_Store_Helper op, string asm, let mayStore = 1; let hasSideEffects = 1; let hasPostISelHook = 0; + let DisableWQM = 1; } multiclass MIMG_Store_Addr_Helper op, string asm, @@ -3454,6 +3460,7 @@ class MIMG_Atomic_Helper (opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2208,7 +2208,7 @@ multiclass MUBUF_StoreIntrinsicPat(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset, + (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2217,7 +2217,7 @@ multiclass MUBUF_StoreIntrinsicPat(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset, + (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2226,7 +2226,7 @@ multiclass MUBUF_StoreIntrinsicPat(opcode # _BOTHEN) + (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), @@ -3391,6 +3391,16 @@ def : Pat < (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; +class FPToI1Pat : Pat < + (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), + (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) +>; + +def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; + // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. 
Vector // comparisions still write to a pair of SGPRs, so treat these as diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index a9b7c39..9d06ccf 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// SI Intrinsic Definitions +// Backend internal SI Intrinsic Definitions. User code should not +// directly use these. // //===----------------------------------------------------------------------===// @@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in { } // End TargetPrefix = "SI", isTarget = 1 let TargetPrefix = "amdgcn", isTarget = 1 in { + // Emit 2.5 ulp, no denormal division. Should only be inserted by + // pass based on !fpmath metadata. + def int_amdgcn_fdiv_fast : Intrinsic< + [llvm_float_ty], [llvm_float_ty], [IntrNoMem] + >; + /* Control flow Intrinsics */ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 4d12a1e..848be32 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -203,7 +203,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( Spill.Lane = Lane; if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, + *MF); if (LaneVGPR == AMDGPU::NoRegister) // We have no VGPRs left for spilling SGPRs. 
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 0dd88ee..347c33f 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -957,10 +957,13 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, /// \brief Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. -unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const { +unsigned +SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineFunction &MF) const { + for (unsigned Reg : *RC) - if (!MRI.isPhysRegUsed(Reg)) + if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) return Reg; return AMDGPU::NoRegister; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 6e97b1b..d8b2d9f 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -185,7 +185,8 @@ public: unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const; + const TargetRegisterClass *RC, + const MachineFunction &MF) const; unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index c1a237e..b200c15 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -94,12 +94,15 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; + LiveIntervals *LIS; DenseMap Instructions; DenseMap Blocks; SmallVector ExecExports; SmallVector LiveMaskQueries; + void markInstruction(MachineInstr &MI, char Flag, + std::vector &Worklist); char 
scanInstructions(MachineFunction &MF, std::vector &Worklist); void propagateInstruction(MachineInstr &MI, std::vector &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); @@ -126,6 +129,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -135,8 +139,11 @@ public: char SIWholeQuadMode::ID = 0; -INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE, - "SI Whole Quad Mode", false, false) +INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, + false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, + false) char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; @@ -144,6 +151,23 @@ FunctionPass *llvm::createSIWholeQuadModePass() { return new SIWholeQuadMode; } +void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, + std::vector &Worklist) { + InstrInfo &II = Instructions[&MI]; + + assert(Flag == StateWQM || Flag == StateExact); + + // Ignore if the instruction is already marked. The typical case is that we + // mark an instruction WQM multiple times, but for atomics it can happen that + // Flag is StateWQM, but Needs is already set to StateExact. In this case, + // letting the atomic run in StateExact is correct as per the relevant specs. + if (II.Needs) + return; + + II.Needs = Flag; + Worklist.push_back(&MI); +} + // Scan instructions to determine which ones require an Exact execmask and // which ones seed WQM requirements. 
char SIWholeQuadMode::scanInstructions(MachineFunction &MF, @@ -161,7 +185,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { Flags = StateWQM; - } else if (MI.mayStore() && TII->usesVM_CNT(MI)) { + } else if (TII->isDisableWQM(MI)) { Flags = StateExact; } else { // Handle export instructions with the exec mask valid flag set @@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, continue; } - Instructions[&MI].Needs = Flags; - Worklist.push_back(&MI); + markInstruction(MI, Flags, Worklist); GlobalFlags |= Flags; } @@ -214,9 +237,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references BlockInfo &BI = Blocks[MBB]; - // Control flow-type instructions that are followed by WQM computations - // must themselves be in WQM. - if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) { + // Control flow-type instructions and stores to temporary memory that are + // followed by WQM computations must themselves be in WQM. + if ((II.OutNeeds & StateWQM) && !II.Needs && + (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } @@ -249,32 +273,35 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, if (!Use.isReg() || !Use.isUse()) continue; - // At this point, physical registers appear as inputs or outputs - // and following them makes no sense (and would in fact be incorrect - // when the same VGPR is used as both an output and an input that leads - // to a NeedsWQM instruction). - // - // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we - // have to trace this, in practice it happens for 64-bit computations like - // pointers where both dwords are followed already anyway. 
- if (!TargetRegisterInfo::isVirtualRegister(Use.getReg())) - continue; - - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) { - InstrInfo &DefII = Instructions[&DefMI]; + unsigned Reg = Use.getReg(); - // Obviously skip if DefMI is already flagged as NeedWQM. - // - // The instruction might also be flagged as NeedExact. This happens when - // the result of an atomic is used in a WQM computation. In this case, - // the atomic must not run for helper pixels and the WQM result is - // undefined. - if (DefII.Needs != 0) + // Handle physical registers that we need to track; this is mostly relevant + // for VCC, which can appear as the (implicit) input of a uniform branch, + // e.g. when a loop counter is stored in a VGPR. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Reg == AMDGPU::EXEC) continue; - DefII.Needs = StateWQM; - Worklist.push_back(&DefMI); + for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { + LiveRange &LR = LIS->getRegUnit(*RegUnit); + const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); + if (!Value) + continue; + + // Since we're in machine SSA, we do not need to track physical + // registers across basic blocks. 
+ if (Value->isPHIDef()) + continue; + + markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, + Worklist); + } + + continue; } + + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) + markInstruction(DefMI, StateWQM, Worklist); } } @@ -468,6 +495,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); + LIS = &getAnalysis(); char GlobalFlags = analyzeFunction(MF); if (!(GlobalFlags & StateWQM)) { diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index d6e7caf..3cfcb1e 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -3857,7 +3857,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to convert two saturating conditional selects into a single SSAT SDValue SatValue; uint64_t SatConstant; - if (isSaturatingConditional(Op, SatValue, SatConstant)) + if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && + isSaturatingConditional(Op, SatValue, SatConstant)) return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 060376b..c9735f3 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -3650,7 +3650,8 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), def SSAT : AI<(outs GPRnopc:$Rd), (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), - SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<5> sat_imm; bits<4> Rn; @@ -3666,7 +3667,8 @@ def SSAT : AI<(outs GPRnopc:$Rd), def SSAT16 : AI<(outs GPRnopc:$Rd), (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm, - NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> { + NoItinerary, 
"ssat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; @@ -3679,7 +3681,8 @@ def SSAT16 : AI<(outs GPRnopc:$Rd), def USAT : AI<(outs GPRnopc:$Rd), (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), - SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsARM,HasV6]> { bits<4> Rd; bits<5> sat_imm; bits<4> Rn; @@ -3695,7 +3698,8 @@ def USAT : AI<(outs GPRnopc:$Rd), def USAT16 : AI<(outs GPRnopc:$Rd), (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm, - NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> { + NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 55e5308..fe699b2 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -2240,7 +2240,8 @@ class T2SatI { + NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2251,7 +2252,7 @@ def t2SSAT: T2SatI< def t2SSAT16: T2SatI< (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2265,7 +2266,8 @@ def t2SSAT16: T2SatI< def t2USAT: T2SatI< (outs rGPR:$Rd), (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), - NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1110; let Inst{20} = 0; @@ -2275,7 +2277,7 @@ def t2USAT: T2SatI< def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasDSP]> { + 
Requires<[IsThumb2, HasDSP]> { let Inst{31-22} = 0b1111001110; let Inst{20} = 0; let Inst{15} = 0; diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index cdad7ce..20c5f36 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -518,6 +518,10 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, return true; return false; + case ELF::R_MIPS_GOT_PAGE: + case ELF::R_MICROMIPS_GOT_PAGE: + case ELF::R_MIPS_GOT_OFST: + case ELF::R_MICROMIPS_GOT_OFST: case ELF::R_MIPS_16: case ELF::R_MIPS_32: case ELF::R_MIPS_GPREL32: @@ -539,8 +543,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, case ELF::R_MIPS_SHIFT5: case ELF::R_MIPS_SHIFT6: case ELF::R_MIPS_GOT_DISP: - case ELF::R_MIPS_GOT_PAGE: - case ELF::R_MIPS_GOT_OFST: case ELF::R_MIPS_GOT_HI16: case ELF::R_MIPS_GOT_LO16: case ELF::R_MIPS_INSERT_A: @@ -589,8 +591,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, case ELF::R_MICROMIPS_PC16_S1: case ELF::R_MICROMIPS_CALL16: case ELF::R_MICROMIPS_GOT_DISP: - case ELF::R_MICROMIPS_GOT_PAGE: - case ELF::R_MICROMIPS_GOT_OFST: case ELF::R_MICROMIPS_GOT_HI16: case ELF::R_MICROMIPS_GOT_LO16: case ELF::R_MICROMIPS_SUB: diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index 1622b22..1ce8f07 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -28,12 +28,19 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) { PointerSize = CalleeSaveStackSlotSize = 8; } + // FIXME: This condition isn't quite right but it's the best we can do until + // this object can identify the ABI. It will misbehave when using O32 + // on a mips64*-* triple. 
+ if ((TheTriple.getArch() == Triple::mipsel) || + (TheTriple.getArch() == Triple::mips)) { + PrivateGlobalPrefix = "$"; + PrivateLabelPrefix = "$"; + } + AlignmentIsInBytes = false; Data16bitsDirective = "\t.2byte\t"; Data32bitsDirective = "\t.4byte\t"; Data64bitsDirective = "\t.8byte\t"; - PrivateGlobalPrefix = "$"; - PrivateLabelPrefix = "$"; CommentString = "#"; ZeroDirective = "\t.space\t"; GPRel32Directive = "\t.gpword\t"; diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index c248c3a..80641ed 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -57,7 +57,10 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, else Ret += "E"; - Ret += "-m:m"; + if (ABI.IsO32()) + Ret += "-m:m"; + else + Ret += "-m:e"; // Pointers are 32 bit on some ABIs. if (!ABI.IsN64()) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e547111..2c54838 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1187,6 +1187,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); @@ -13373,6 +13381,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = 
Op.getSimpleValueType(); SDLoc dl(Op); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, @@ -13380,6 +13389,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, DAG.getUNDEF(SrcVT))); } if (SrcVT.getVectorElementType() == MVT::i1) { + if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)) + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src)); MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); @@ -13694,6 +13706,15 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); + if (SVT.getVectorElementType() == MVT::i1) { + if (SVT == MVT::v2i1) + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0)); + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); + } + switch (SVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 1672b38..5f0aab9 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2661,7 +2661,8 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp) const { + MachineOperand &ImplicitOp, + LiveVariables *LV) const { MachineFunction &MF = *MI.getParent()->getParent(); const TargetRegisterClass *RC; if (AllowSP) { @@ -2715,13 +2716,17 @@ bool 
X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. NewSrc = MF.getRegInfo().createVirtualRegister(RC); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) + MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + get(TargetOpcode::COPY)) .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) .addOperand(Src); // Which is obviously going to be dead after we're done with it. isKill = true; isUndef = false; + + if (LV) + LV->replaceKillInstruction(SrcReg, MI, *Copy); } // We've set all the parameters without issue. @@ -2900,7 +2905,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -2943,7 +2948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -2977,7 +2982,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -3016,7 +3021,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, 
/*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; const MachineOperand &Src2 = MI.getOperand(2); @@ -3024,7 +3029,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, - SrcReg2, isKill2, isUndef2, ImplicitOp2)) + SrcReg2, isKill2, isUndef2, ImplicitOp2, LV)) return nullptr; MachineInstrBuilder MIB = @@ -3087,7 +3092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 858f35d..a8a9f62 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -230,7 +230,7 @@ public: bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp) const; + MachineOperand &ImplicitOp, LiveVariables *LV) const; /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. 
When this flag is set, the target diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 9a98f5c..f91764a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1820,7 +1820,7 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; -def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, +def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -1836,7 +1836,7 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; -def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, +def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + [(set VR256:$dst, + (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + 
IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L, + [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 + (loadv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), @@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index fff5440..787f434 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ 
b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -332,6 +332,7 @@ struct ArgumentUsesTracker : public CaptureTracker { namespace llvm { template <> struct GraphTraits { typedef ArgumentGraphNode NodeType; + typedef ArgumentGraphNode *NodeRef; typedef SmallVectorImpl::iterator ChildIteratorType; static inline NodeType *getEntryNode(NodeType *A) { return A; } diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 310c292..99b12d4 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -44,6 +44,7 @@ #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Transforms/Utils/Local.h" #include using namespace llvm; @@ -779,7 +780,8 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL, // Instructions could multiply use V. while (UI != E && *UI == I) ++UI; - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) + I->eraseFromParent(); } } diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index cf5b76d..df6a48e 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -134,6 +134,10 @@ static cl::opt PreInlineThreshold( cl::desc("Control the amount of inlining in pre-instrumentation inliner " "(default = 75)")); +static cl::opt EnableGVNHoist( + "enable-gvn-hoist", cl::init(false), cl::Hidden, + cl::desc("Enable the experimental GVN Hoisting pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -232,7 +236,8 @@ void PassManagerBuilder::populateFunctionPassManager( FPM.add(createCFGSimplificationPass()); FPM.add(createSROAPass()); FPM.add(createEarlyCSEPass()); - FPM.add(createGVNHoistPass()); + if(EnableGVNHoist) + FPM.add(createGVNHoistPass()); FPM.add(createLowerExpectIntrinsicPass()); } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp 
b/lib/Transforms/InstCombine/InstCombineSelect.cpp index d7eed79..8f1ff8a 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -553,8 +553,11 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } } + // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring + // decomposeBitTestICmp() might help. { - unsigned BitWidth = DL.getTypeSizeInBits(TrueVal->getType()); + unsigned BitWidth = + DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); APInt MinSignedValue = APInt::getSignBit(BitWidth); Value *X; const APInt *Y, *C; diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 51c3262..377ccb9 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2830,7 +2830,8 @@ bool InstCombiner::run() { // Add operands to the worklist. replaceInstUsesWith(*I, C); ++NumConstProp; - eraseInstFromFunction(*I); + if (isInstructionTriviallyDead(I, TLI)) + eraseInstFromFunction(*I); MadeIRChange = true; continue; } @@ -2851,7 +2852,8 @@ bool InstCombiner::run() { // Add operands to the worklist. 
replaceInstUsesWith(*I, C); ++NumConstProp; - eraseInstFromFunction(*I); + if (isInstructionTriviallyDead(I, TLI)) + eraseInstFromFunction(*I); MadeIRChange = true; continue; } @@ -3007,7 +3009,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, << *Inst << '\n'); Inst->replaceAllUsesWith(C); ++NumConstProp; - Inst->eraseFromParent(); + if (isInstructionTriviallyDead(Inst, TLI)) + Inst->eraseFromParent(); continue; } diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index dcb62d3..41041c7 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -272,8 +272,9 @@ static bool shouldInstrumentReadWriteFromAddress(Value *Addr) { return false; } - // Check if the global is in a GCOV counter array. - if (GV->getName().startswith("__llvm_gcov_ctr")) + // Check if the global is private gcov data. + if (GV->getName().startswith("__llvm_gcov") || + GV->getName().startswith("__llvm_gcda")) return false; } diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 88172d1..9e98219 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constant.h" @@ -90,11 +91,13 @@ bool ConstantPropagation::runOnFunction(Function &F) { // Remove the dead instruction. WorkList.erase(I); - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) { + I->eraseFromParent(); + ++NumInstKilled; + } // We made a change to the function... 
Changed = true; - ++NumInstKilled; } } return Changed; diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 9d0ef42..0b16e27 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -582,6 +582,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // its simpler value. if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); + bool Killed = false; if (!Inst->use_empty()) { Inst->replaceAllUsesWith(V); Changed = true; @@ -589,11 +590,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (isInstructionTriviallyDead(Inst, &TLI)) { Inst->eraseFromParent(); Changed = true; + Killed = true; } - if (Changed) { + if (Changed) ++NumSimplify; + if (Killed) continue; - } } // If this is a simple instruction that we can value number, process it. diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 542cf38..e958563 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -815,6 +815,14 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, if (!Cast->getModule()->getDataLayout().isLegalInteger(Width)) return; + // Check that `Cast` actually extends the induction variable (we rely on this + // later). This takes care of cases where `Cast` is extending a truncation of + // the narrow induction variable, and thus can end up being narrower than the + // "narrow" induction variable. + uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType()); + if (NarrowIVWidth >= Width) + return; + // Cast is either an sext or zext up to this point. // We should not widen an indvar if arithmetics on the wider indvar are more // expensive than those on the narrower indvar. 
We check only the cost of ADD diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index b9e717c..d1769fc 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -758,7 +758,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) + I->eraseFromParent(); Condition = SimpleVal; } } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 2c0a70e..cdd17fc 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -377,9 +377,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, &I, I.getModule()->getDataLayout(), TLI)) { DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); CurAST->copyValue(&I, C); - CurAST->deleteValue(&I); I.replaceAllUsesWith(C); - I.eraseFromParent(); + if (isInstructionTriviallyDead(&I, TLI)) { + CurAST->deleteValue(&I); + I.eraseFromParent(); + } continue; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 77c77eb..70bd9d3 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -4442,6 +4442,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Determine an input position which will be dominated by the operands and // which will dominate the result. IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); + Rewriter.setInsertPoint(&*IP); // Inform the Rewriter if we have a post-increment use, so that it can // perform an advantageous expansion. 
@@ -4473,7 +4474,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr))); } // Expand the ScaledReg portion. @@ -4491,14 +4492,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand ScaleReg as if it was part of the base regs. if (F.Scale == 1) Ops.push_back( - SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP))); + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr))); else { // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr); } } else { // Otherwise just expand the scaled register and an explicit scale, @@ -4508,11 +4509,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Unless the addressing mode will not be folded. if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)); if (F.Scale != 1) ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4524,7 +4525,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. 
if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4534,7 +4535,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Flush the operand list to suppress SCEVExpander hoisting of both folded and // unfolded offsets. LSR assumes they both live next to their uses. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4570,7 +4571,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); - Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty); // We're done expanding now, so reset the rewriter. Rewriter.clearPostInc(); diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index c5ca563..4f1052d 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -552,9 +553,39 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // two PHINodes, the iteration over the old PHIs remains valid, and the // mapping will just map us to the new node (which may not even be a PHI // node). 
+ const DataLayout &DL = NewFunc->getParent()->getDataLayout(); + SmallSetVector Worklist; for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx) - if (PHINode *PN = dyn_cast(VMap[PHIToResolve[Idx]])) - recursivelySimplifyInstruction(PN); + if (isa(VMap[PHIToResolve[Idx]])) + Worklist.insert(PHIToResolve[Idx]); + + // Note that we must test the size on each iteration, the worklist can grow. + for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { + const Value *OrigV = Worklist[Idx]; + auto *I = dyn_cast_or_null(VMap.lookup(OrigV)); + if (!I) + continue; + + // See if this instruction simplifies. + Value *SimpleV = SimplifyInstruction(I, DL); + if (!SimpleV) + continue; + + // Stash away all the uses of the old instruction so we can check them for + // recursive simplifications after a RAUW. This is cheaper than checking all + // uses of To on the recursive step in most cases. + for (const User *U : OrigV->users()) + Worklist.insert(cast(U)); + + // Replace the instruction with its simplified value. + I->replaceAllUsesWith(SimpleV); + + // If the original instruction had no side effects, remove it. + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); + else + VMap[OrigV] = I; + } // Now that the inlined function body has been fully constructed, go through // and zap unconditional fall-through branches. This happens all the time when diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 1fbb19d..e82c07f 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1294,6 +1294,13 @@ updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode, return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); } +/// Return the result of AI->isStaticAlloca() if AI were moved to the entry +/// block. Allocas used in inalloca calls and allocas of dynamic array size +/// cannot be static. 
+static bool allocaWouldBeStaticInEntry(const AllocaInst *AI ) { + return isa(AI->getArraySize()) && !AI->isUsedWithInAlloca(); +} + /// Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, @@ -1328,7 +1335,7 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, // Don't update static allocas, as they may get moved later. if (auto *AI = dyn_cast(BI)) - if (isa(AI->getArraySize())) + if (allocaWouldBeStaticInEntry(AI)) continue; BI->setDebugLoc(TheCallDL); @@ -1626,7 +1633,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, continue; } - if (!isa(AI->getArraySize())) + if (!allocaWouldBeStaticInEntry(AI)) continue; // Keep track of the static allocas that we inline into the caller. @@ -1635,7 +1642,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Scan for the block of allocas that we can move over, and move them // all at once. 
while (isa(I) && - isa(cast(I)->getArraySize())) { + allocaWouldBeStaticInEntry(cast(I))) { IFI.StaticAllocas.push_back(cast(I)); ++I; } diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index 9658966..0d5a25b 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -64,6 +64,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, DominatorTree &DT, LoopInfo &LI) { SmallVector UsesToRewrite; SmallVector ExitBlocks; + SmallSetVector PHIsToRemove; PredIteratorCache PredCache; bool Changed = false; @@ -115,7 +116,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, SmallVector AddedPHIs; SmallVector PostProcessPHIs; - SSAUpdater SSAUpdate; + SmallVector InsertedPHIs; + SSAUpdater SSAUpdate(&InsertedPHIs); SSAUpdate.Initialize(I->getType(), I->getName()); // Insert the LCSSA phi's into all of the exit blocks dominated by the @@ -184,6 +186,14 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, // Otherwise, do full PHI insertion. SSAUpdate.RewriteUse(*UseToRewrite); + + // SSAUpdater might have inserted phi-nodes inside other loops. We'll need + // to post-process them to keep LCSSA form. + for (PHINode *InsertedPN : InsertedPHIs) { + if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent())) + if (!L->contains(OtherLoop)) + PostProcessPHIs.push_back(InsertedPN); + } } // Post process PHI instructions that were inserted into another disjoint @@ -196,13 +206,19 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, Worklist.push_back(PostProcessPN); } - // Remove PHI nodes that did not have any uses rewritten. + // Keep track of PHI nodes that we want to remove because they did not have + // any uses rewritten. for (PHINode *PN : AddedPHIs) if (PN->use_empty()) - PN->eraseFromParent(); + PHIsToRemove.insert(PN); Changed = true; } + // Remove PHI nodes that did not have any uses rewritten. 
+ for (PHINode *PN : PHIsToRemove) { + assert (PN->use_empty() && "Trying to remove a phi with uses."); + PN->eraseFromParent(); + } return Changed; } diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index b3a928b..2846e8f 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -327,6 +327,8 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, else NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I)); + SmallVector OuterLoopBlocks; + OuterLoopBlocks.push_back(NewBB); // Now that we know which blocks are in L and which need to be moved to // OuterLoop, move any blocks that need it. for (unsigned i = 0; i != L->getBlocks().size(); ++i) { @@ -334,12 +336,53 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (!BlocksInL.count(BB)) { // Move this block to the parent, updating the exit blocks sets L->removeBlockFromLoop(BB); - if ((*LI)[BB] == L) + if ((*LI)[BB] == L) { LI->changeLoopFor(BB, NewOuter); + OuterLoopBlocks.push_back(BB); + } --i; } } + // Split edges to exit blocks from the inner loop, if they emerged in the + // process of separating the outer one. + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + SmallSetVector ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + for (BasicBlock *ExitBlock : ExitBlockSet) { + if (any_of(predecessors(ExitBlock), + [L](BasicBlock *BB) { return !L->contains(BB); })) { + rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); + } + } + + if (PreserveLCSSA) { + // Fix LCSSA form for L. Some values, which previously were only used inside + // L, can now be used in NewOuter loop. We need to insert phi-nodes for them + // in corresponding exit blocks. + + // Go through all instructions in OuterLoopBlocks and check if they are + // using operands from the inner loop. In this case we'll need to fix LCSSA + // for these instructions. 
+ SmallSetVector WorklistSet; + for (BasicBlock *OuterBB: OuterLoopBlocks) { + for (Instruction &I : *OuterBB) { + for (Value *Op : I.operands()) { + Instruction *OpI = dyn_cast(Op); + if (!OpI || !L->contains(OpI)) + continue; + WorklistSet.insert(OpI); + } + } + } + SmallVector Worklist(WorklistSet.begin(), + WorklistSet.end()); + formLCSSAForInstructions(Worklist, *DT, *LI); + assert(NewOuter->isRecursivelyLCSSAForm(*DT) && + "LCSSA is broken after separating nested loops!"); + } + return NewOuter; } @@ -541,17 +584,12 @@ ReprocessLoop: SmallSetVector ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); for (BasicBlock *ExitBlock : ExitBlockSet) { - for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock); - PI != PE; ++PI) - // Must be exactly this loop: no subloops, parent loops, or non-loop preds - // allowed. - if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA)) { - ++NumInserted; - Changed = true; - } - break; - } + if (any_of(predecessors(ExitBlock), + [L](BasicBlock *BB) { return !L->contains(BB); })) { + rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); + ++NumInserted; + Changed = true; + } } // If the header has more than two predecessors at this point (from the diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8b85e32..ee5733d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -50,6 +50,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -220,6 +221,81 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; +// A traits type that is intended to be used in graph algorithms. 
The graph it +// models starts at the loop header, and traverses the BasicBlocks that are in +// the loop body, but not the loop header. Since the loop header is skipped, +// the back edges are excluded. +struct LoopBodyTraits { + using NodeRef = std::pair; + + // This wraps a const Loop * into the iterator, so we know which edges to + // filter out. + class WrappedSuccIterator + : public iterator_adaptor_base< + WrappedSuccIterator, succ_iterator, + typename std::iterator_traits::iterator_category, + NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + using BaseT = iterator_adaptor_base< + WrappedSuccIterator, succ_iterator, + typename std::iterator_traits::iterator_category, + NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>; + + const Loop *L; + + public: + WrappedSuccIterator(succ_iterator Begin, const Loop *L) + : BaseT(Begin), L(L) {} + + NodeRef operator*() const { return {L, *I}; } + }; + + struct LoopBodyFilter { + bool operator()(NodeRef N) const { + const Loop *L = N.first; + return N.second != L->getHeader() && L->contains(N.second); + } + }; + + using ChildIteratorType = + filter_iterator; + + static NodeRef getEntryNode(const Loop &G) { return {&G, G.getHeader()}; } + + static ChildIteratorType child_begin(NodeRef Node) { + return make_filter_range(make_range( + {succ_begin(Node.second), Node.first}, + {succ_end(Node.second), Node.first}), + LoopBodyFilter{}) + .begin(); + } + + static ChildIteratorType child_end(NodeRef Node) { + return make_filter_range(make_range( + {succ_begin(Node.second), Node.first}, + {succ_end(Node.second), Node.first}), + LoopBodyFilter{}) + .end(); + } +}; + +/// Returns true if the given loop body has a cycle, excluding the loop +/// itself. 
+static bool hasCyclesInLoopBody(const Loop &L) { + if (!L.empty()) + return true; + + for (const auto SCC : + make_range(scc_iterator::begin(L), + scc_iterator::end(L))) { + if (SCC.size() > 1) { + DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n"); + DEBUG(L.dump()); + return true; + } + } + return false; +} + /// \brief This modifies LoopAccessReport to initialize message with /// loop-vectorizer-specific part. class VectorizationReport : public LoopAccessReport { @@ -1782,12 +1858,14 @@ private: Instruction *UnsafeAlgebraInst; }; -static void addInnerLoop(Loop &L, SmallVectorImpl &V) { - if (L.empty()) - return V.push_back(&L); - +static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl &V) { + if (L.empty()) { + if (!hasCyclesInLoopBody(L)) + V.push_back(&L); + return; + } for (Loop *InnerL : L) - addInnerLoop(*InnerL, V); + addAcyclicInnerLoop(*InnerL, V); } /// The LoopVectorize Pass. @@ -4395,6 +4473,9 @@ bool LoopVectorizationLegality::canVectorize() { return false; } + // FIXME: The code is currently dead, since the loop gets sent to + // LoopVectorizationLegality is already an innermost loop. + // // We can only vectorize innermost loops. if (!TheLoop->empty()) { emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); @@ -6639,7 +6720,7 @@ bool LoopVectorizePass::runImpl( SmallVector Worklist; for (Loop *L : *LI) - addInnerLoop(*L, Worklist); + addAcyclicInnerLoop(*L, Worklist); LoopsAnalyzed += Worklist.size(); diff --git a/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll b/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll new file mode 100644 index 0000000..a71b5e8 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -mtriple=aarch64-linux-eabi -o - | FileCheck %s + +%struct.a= type { i64, i64, i64, i64 } + +; DAG combine will try to perform a transformation that creates a vcvtfp2fxs +; with a v4f64 input. Since v4i64 is not legal we should bail out. 
We can +; pottentially still create the vcvtfp2fxs node after legalization (but on a +; v2f64). + +; CHECK-LABEL: fun1 +define void @fun1() local_unnamed_addr { +entry: + %mul = fmul <4 x double> zeroinitializer, + %toi = fptosi <4 x double> %mul to <4 x i64> + %ptr = getelementptr inbounds %struct.a, %struct.a* undef, i64 0, i32 2 + %elem = extractelement <4 x i64> %toi, i32 1 + store i64 %elem, i64* %ptr, align 8 + call void @llvm.trap() + unreachable +} + +; Function Attrs: noreturn nounwind +declare void @llvm.trap() + diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll index a12132f..d78c751 100644 --- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll +++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll @@ -1,8 +1,246 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s -; RUN: opt -S -amdgpu-codegenprepare < %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s +; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s ; Make sure this doesn't crash with no triple -; CHECK-LABEL: @foo( -define void @foo() { +; NOOP-LABEL: @noop_fdiv_fpmath( +; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0 +define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out ret void } + +; CHECK-LABEL: @fdiv_fpmath( +; CHECK: %no.md = fdiv float %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 +; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 +; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +define void 
@fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { + %no.md = fdiv float %a, %b + store volatile float %no.md, float addrspace(1)* %out + + %md.half.ulp = fdiv float %a, %b, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %md.1ulp = fdiv float %a, %b, !fpmath !2 + store volatile float %md.1ulp, float addrspace(1)* %out + + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + + %md.3ulp = fdiv float %a, %b, !fpmath !3 + store volatile float %md.3ulp, float addrspace(1)* %out + + %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 + store volatile float %fast.md.25ulp, float addrspace(1)* %out + + %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 + store volatile float %arcp.md.25ulp, float addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @rcp_fdiv_fpmath( +; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} +; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0 +; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 +; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}} +; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 +; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} +; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 +define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { + %no.md = fdiv float 1.0, %x + store volatile float %no.md, float addrspace(1)* %out + + %md.25ulp = fdiv float 1.0, %x, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + + %md.half.ulp = fdiv float 1.0, %x, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %arcp.no.md = fdiv arcp float 1.0, %x + store volatile float %arcp.no.md, float addrspace(1)* %out + + %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0 + store volatile float %arcp.25ulp, float addrspace(1)* %out + + %fast.no.md = fdiv fast float 1.0, %x + store volatile float %fast.no.md, float addrspace(1)* %out 
+ + %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0 + store volatile float %fast.25ulp, float addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @fdiv_fpmath_vector( +; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 + +; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0 +; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0 +; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 +; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 +; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0 +; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1 +define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { + %no.md = fdiv <2 x float> %a, %b + store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + + %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 + store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out + + %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 + store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out + + %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0 + store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @rcp_fdiv_fpmath_vector( +; CHECK: %no.md = fdiv <2 x float> , %x{{$}} +; CHECK: %md.half.ulp = fdiv <2 x float> , %x, !fpmath !1 +; CHECK: %arcp.no.md = fdiv arcp <2 x float> , %x{{$}} +; CHECK: %fast.no.md = fdiv fast <2 x float> , %x{{$}} + +; CHECK: extractelement <2 x float> %x +; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: extractelement <2 x float> %x +; CHECK: fdiv arcp 
float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: store volatile <2 x float> %arcp.25ulp + +; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out +define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { + %no.md = fdiv <2 x float> , %x + store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + + %md.half.ulp = fdiv <2 x float> , %x, !fpmath !1 + store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out + + %arcp.no.md = fdiv arcp <2 x float> , %x + store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + + %fast.no.md = fdiv fast <2 x float> , %x + store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out + + %arcp.25ulp = fdiv arcp <2 x float> , %x, !fpmath !0 + store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + + %fast.25ulp = fdiv fast <2 x float> , %x, !fpmath !0 + store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat( +; CHECK: %no.md = fdiv <2 x float> , %x +; CHECK: %arcp.no.md = fdiv arcp <2 x float> , %x +; CHECK: %fast.no.md = fdiv fast <2 x float> , %x{{$}} + +; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0 +; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0 +; CHECK: store volatile <2 x float> %arcp.25ulp + +; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 +; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0 +; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 +; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 +; CHECK: store volatile <2 x float> %fast.25ulp +define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> 
addrspace(1)* %out, <2 x float> %x) #1 { + %no.md = fdiv <2 x float> , %x + store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out + + %arcp.no.md = fdiv arcp <2 x float> , %x + store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out + + %fast.no.md = fdiv fast <2 x float> , %x + store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out + + %arcp.25ulp = fdiv arcp <2 x float> , %x, !fpmath !0 + store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + + %fast.25ulp = fdiv fast <2 x float> , %x, !fpmath !0 + store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; FIXME: Should be able to get fdiv for 1.0 component +; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: store volatile <2 x float> %arcp.25ulp + +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: store volatile <2 x float> %fast.25ulp +define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { + %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 + + %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + + %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @fdiv_fpmath_f32_denormals( +; CHECK: %no.md = fdiv float %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 +; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 +; CHECK: 
%md.3ulp = fdiv float %a, %b, !fpmath !3 +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { + %no.md = fdiv float %a, %b + store volatile float %no.md, float addrspace(1)* %out + + %md.half.ulp = fdiv float %a, %b, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %md.1ulp = fdiv float %a, %b, !fpmath !2 + store volatile float %md.1ulp, float addrspace(1)* %out + + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + + %md.3ulp = fdiv float %a, %b, !fpmath !3 + store volatile float %md.3ulp, float addrspace(1)* %out + + %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 + store volatile float %fast.md.25ulp, float addrspace(1)* %out + + %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 + store volatile float %arcp.md.25ulp, float addrspace(1)* %out + + ret void +} + +attributes #0 = { nounwind optnone noinline } +attributes #1 = { nounwind } +attributes #2 = { nounwind "target-features"="+fp32-denormals" } + +; CHECK: !0 = !{float 2.500000e+00} +; CHECK: !1 = !{float 5.000000e-01} +; CHECK: !2 = !{float 1.000000e+00} +; CHECK: !3 = !{float 3.000000e+00} + +!0 = !{float 2.500000e+00} +!1 = !{float 5.000000e-01} +!2 = !{float 1.000000e+00} +!3 = !{float 3.000000e+00} diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 7b51586..bd0817d 100644 --- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -417,12 +417,6 @@ entry: ret void } -; HSAOPT: !0 = !{} -; HSAOPT: !1 = !{i32 0, i32 2048} - -; NOHSAOPT: !0 = !{i32 0, i32 2048} - - ; FUNC-LABEL: v16i32_stack: ; R600: MOVA_INT @@ -527,4 +521,33 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { ret void } +; OPT-LABEL: 
@direct_alloca_read_0xi32( +; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)* +; OPT: load [0 x i32], [0 x i32] addrspace(3)* +define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [0 x i32] + store [0 x i32] [], [0 x i32]* %tmp + %load = load [0 x i32], [0 x i32]* %tmp + store [0 x i32] %load, [0 x i32] addrspace(1)* %out + ret void +} + +; OPT-LABEL: @direct_alloca_read_1xi32( +; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)* +; OPT: load [1 x i32], [1 x i32] addrspace(3)* +define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [1 x i32] + store [1 x i32] [i32 0], [1 x i32]* %tmp + %load = load [1 x i32], [1 x i32]* %tmp + store [1 x i32] %load, [1 x i32] addrspace(1)* %out + ret void +} + attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" } + +; HSAOPT: !0 = !{} +; HSAOPT: !1 = !{i32 0, i32 2048} + +; NOHSAOPT: !0 = !{i32 0, i32 2048} diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll index ff730a0..0063624 100644 --- a/test/CodeGen/AMDGPU/basic-branch.ll +++ b/test/CodeGen/AMDGPU/basic-branch.ll @@ -6,7 +6,6 @@ ; GCN-LABEL: {{^}}test_branch: ; GCNNOOPT: v_writelane_b32 ; GCNNOOPT: v_writelane_b32 -; GCNNOOPT: v_writelane_b32 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] ; GCN: ; BB#1 diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll index 4021233..65464cd 100644 --- a/test/CodeGen/AMDGPU/fdiv.ll +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -1,8 +1,4 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | 
FileCheck -check-prefix=I754 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; These tests check that fdiv is expanded correctly and also test that the @@ -15,22 +11,59 @@ ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 +; SI: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 ; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_mul_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_div_fmas_f32 +; SI: v_div_fixup_f32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv float %a, %b + store float %fdiv, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fdiv_25ulp_f32: +; SI: v_cndmask_b32 +; SI: v_mul_f32 +; SI: v_rcp_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv float %a, %b, !fpmath !0 + store float %fdiv, float addrspace(1)* %out + ret void +} + +; Use correct fdiv +; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32: +; SI: v_fma_f32 +; SI: v_div_fmas_f32 +; SI: v_div_fixup_f32 +define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +entry: + %fdiv = fdiv float %a, %b, !fpmath !0 + store float %fdiv, float addrspace(1)* %out + ret void +} -; I754-DAG: v_div_scale_f32 -; I754-DAG: v_rcp_f32 -; I754-DAG: v_fma_f32 -; I754-DAG: v_mul_f32 -; I754-DAG: v_fma_f32 -; I754-DAG: v_div_fixup_f32 -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32: +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: 
[[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: - %0 = fdiv float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv fast float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -38,15 +71,14 @@ entry: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) { +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: [[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: - %0 = fdiv fast float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv fast float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -54,15 +86,14 @@ entry: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) { +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: [[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: - %0 = fdiv arcp float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv arcp float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -72,26 +103,24 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: 
v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +entry: + %fdiv = fdiv <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out + ret void +} -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32: +; SI: v_cmp_gt_f32 +; SI: v_cmp_gt_f32 +define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -101,19 +130,12 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv fast <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv fast <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -123,19 +145,12 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** 
*}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv arcp <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv arcp <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -149,37 +164,11 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 - -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr 
@@ -198,24 +187,11 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -234,24 +210,11 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) 
#0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -259,3 +222,9 @@ define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> ad store <4 x float> %result, <4 x float> addrspace(1)* %out ret void } + +attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" } +attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" } +attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" } + +!0 = !{float 2.500000e+00} diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll index be23e10..1537d67 100644 --- a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll +++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -1,7 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare double @llvm.fabs.f64(double) #1 ; FUNC-LABEL: @fp_to_sint_f64_i32 ; SI: v_cvt_i32_f64_e32 @@ -54,3 +55,23 @@ define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in store i64 %cast, i64 addrspace(1)* %out, align 8 ret void } + +; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1: +; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}} +define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { + %conv = fptosi double %in to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1: +; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}| +define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { + %in.fabs = 
call double @llvm.fabs.f64(double %in) + %conv = fptosi double %in.fabs to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll index b39aead..0cd0358 100644 --- a/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC -declare float @llvm.fabs.f32(float) #0 +declare float @llvm.fabs.f32(float) #1 ; FUNC-LABEL: {{^}}fp_to_sint_i32: ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} @@ -17,7 +17,7 @@ define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { ; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: ; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { - %in.fabs = call float @llvm.fabs.f32(float %in) #0 + %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptosi float %in.fabs to i32 store i32 %conv, i32 addrspace(1)* %out ret void @@ -227,4 +227,26 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { ret void } -attributes #0 = { nounwind readnone } +; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i1: +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}} + +; EG: AND_INT +; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y, +; EG-NEXT: -1082130432(-1.000000e+00) +define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { + %conv = fptosi float %in to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}| +define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { + %in.fabs = call float 
@llvm.fabs.f32(float %in) + %conv = fptosi float %in.fabs to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll index 760019e..d5bc416 100644 --- a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll +++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -1,7 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare double @llvm.fabs.f64(double) #1 ; SI-LABEL: {{^}}fp_to_uint_i32_f64: ; SI: v_cvt_u32_f64_e32 @@ -68,3 +69,23 @@ define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> % store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 ret void } + +; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1: +; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}} +define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { + %conv = fptoui double %in to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1: +; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}| +define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { + %in.fabs = call double @llvm.fabs.f64(double %in) + %conv = fptoui double %in.fabs to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll index b7b6ccc..8a0f9fa 100644 --- a/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -1,6 +1,8 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG 
-check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC + +declare float @llvm.fabs.f32(float) #1 ; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32: ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} @@ -215,3 +217,27 @@ define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> store <4 x i64> %conv, <4 x i64> addrspace(1)* %out ret void } + + +; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i1: +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}} + +; EG: AND_INT +; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0, +define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { + %conv = fptoui float %in to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: +; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}| +define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { + %in.fabs = call float @llvm.fabs.f32(float %in) + %conv = fptoui float %in.fabs to i1 + store i1 %conv, i1 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll new file mode 100644 index 0000000..4e17a92 --- /dev/null +++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll @@ -0,0 +1,8 @@ +; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s +; check llc does not crash for invalid opencl version metadata + +; CHECK: .section .AMDGPU.runtime_metadata +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .short 256 + 
+!opencl.ocl.version = !{} diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll new file mode 100644 index 0000000..35b7d70 --- /dev/null +++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll @@ -0,0 +1,9 @@ +; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s +; check llc does not crash for invalid opencl version metadata + +; CHECK: .section .AMDGPU.runtime_metadata +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .short 256 + +!opencl.ocl.version = !{!0} +!0 = !{} diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll new file mode 100644 index 0000000..e169355 --- /dev/null +++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll @@ -0,0 +1,9 @@ +; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s +; check llc does not crash for invalid opencl version metadata + +; CHECK: .section .AMDGPU.runtime_metadata +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .short 256 + +!opencl.ocl.version = !{!0} +!0 = !{i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll new file mode 100644 index 0000000..54d7848 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +declare float @llvm.amdgcn.fdiv.fast(float, float) #0 + +; CHECK-LABEL: {{^}}test_fdiv_fast: +; CHECK: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc +; CHECK: v_mul_f32_e32 +; CHECK: v_rcp_f32_e32 +; CHECK: v_mul_f32_e32 +; CHECK: v_mul_f32_e32 +define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { + %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b) + store float %fdiv, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll 
b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll deleted file mode 100644 index cf6d1ab..0000000 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s - - -@lds0 = addrspace(3) global [512 x float] undef, align 4 -@lds1 = addrspace(3) global [256 x float] undef, align 4 - -; FUNC-LABEL: {{^}}groupstaticsize_test0: -; CHECK: s_movk_i32 s{{[0-9]+}}, 0x800 -define void @get_groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %idx.0 = add nsw i32 %tid.x, 64 - %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 - store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - store float %val0, float addrspace(1)* %out, align 4 - - ret void -} - - -; FUNC-LABEL: {{^}}groupstaticsize_test1: -; CHECK: s_movk_i32 s{{[0-9]+}}, 0xc00 -define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { -entry: - %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 - store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 - %idx.0 = add nsw i32 %tid.x, 64 - %tmp = icmp eq i32 %cond, 0 - br i1 %tmp, label %if, label %else - -if: ; preds = %entry - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - store float %val0, float addrspace(1)* %out, align 4 - br label %endif - -else: ; preds = %entry - 
%arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - store float %val1, float addrspace(1)* %out, align 4 - br label %endif - -endif: ; preds = %else, %if - ret void -} - - -declare i32 @llvm.amdgcn.groupstaticsize() #1 -declare i32 @llvm.amdgcn.workitem.id.x() #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll new file mode 100644 index 0000000..6014e2e --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll @@ -0,0 +1,66 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s + +@lds0 = addrspace(3) global [512 x float] undef, align 4 +@lds1 = addrspace(3) global [256 x float] undef, align 4 + +@large = addrspace(3) global [4096 x i32] undef, align 4 + +; CHECK-LABEL: {{^}}groupstaticsize_test0: +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} +define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %idx.0 = add nsw i32 %tid.x, 64 + %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 + store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + store float %val0, float addrspace(1)* %out, align 4 + + ret void +} + +; CHECK-LABEL: {{^}}groupstaticsize_test1: +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} +define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { +entry: + 
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 + store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %idx.0 = add nsw i32 %tid.x, 64 + %tmp = icmp eq i32 %cond, 0 + br i1 %tmp, label %if, label %else + +if: ; preds = %entry + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + store float %val0, float addrspace(1)* %out, align 4 + br label %endif + +else: ; preds = %entry + %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + store float %val1, float addrspace(1)* %out, align 4 + br label %endif + +endif: ; preds = %else, %if + ret void +} + +; Exceeds 16-bit simm limit of s_movk_i32 +; CHECK-LABEL: {{^}}large_groupstaticsize: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} +define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { + %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx + store volatile i32 0, i32 addrspace(3)* %gep + %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() + store i32 %static_lds_size, i32 addrspace(1)* %size + ret void +} + +declare i32 @llvm.amdgcn.groupstaticsize() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll index b1d4220..27a88f7 100644 --- a/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -1,11 +1,96 @@ -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: Evergreen only ever does unsafe fp math. ; FUNC-LABEL: {{^}}rcp_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + ; EG: RECIP_IEEE -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void } + +; FUNC-LABEL: {{^}}rcp_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_fast_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv fast float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_arcp_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv arcp float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_global_fast_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: 
v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { + %rcp = fdiv float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_fabs_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], |[[SRC]]| +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { + %src.fabs = call float @llvm.fabs.f32(float %src) + %rcp = fdiv float 1.0, %src.fabs + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FIXME: fneg folded into constant 1 +; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_f32: +define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { + %src.fabs = call float @llvm.fabs.f32(float %src) + %src.fabs.fneg = fsub float -0.0, %src.fabs + %rcp = fdiv float 1.0, %src.fabs.fneg + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + + +declare float @llvm.fabs.f32(float) #1 + +attributes #0 = { nounwind "unsafe-fp-math"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "unsafe-fp-math"="true" } + +!0 = !{float 2.500000e+00} diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll deleted file mode 100644 index f9292a7..0000000 --- a/test/CodeGen/AMDGPU/reciprocal.ll +++ /dev/null @@ -1,13 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define amdgpu_ps void @test(<4 x float> inreg %reg0) { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = fdiv float 1.0, %r0 - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) diff --git 
a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll index 10187f6..4ba4ac7 100644 --- a/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -348,7 +348,6 @@ bb7: ; preds = %bb4 ; CHECK: image_sample_c ; CHECK: v_cmp_neq_f32_e32 vcc, 0, -; CHECK: s_and_b64 exec, exec, ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc ; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]] @@ -385,6 +384,7 @@ bb9: ; preds = %bb4 declare void @llvm.AMDGPU.kill(float) #0 declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind attributes #0 = { nounwind } attributes #1 = { nounwind readnone } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll index c151ca9..7dcf36f 100644 --- a/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/test/CodeGen/AMDGPU/vector-alloca.ll @@ -3,6 +3,11 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s + +; OPT-LABEL: @vector_read( +; OPT: %0 = extractelement <4 x i32> , i32 %index +; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 ; FUNC-LABEL: {{^}}vector_read: ; EG: MOV @@ -12,21 +17,26 @@ ; EG: MOVA_INT define void @vector_read(i32 addrspace(1)* %out, i32 %index) { entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = 
getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 + %tmp = alloca [4 x i32] + %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 store i32 0, i32* %x store i32 1, i32* %y store i32 2, i32* %z store i32 3, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index - %2 = load i32, i32* %1 - store i32 %2, i32 addrspace(1)* %out + %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index + %tmp2 = load i32, i32* %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out ret void } +; OPT-LABEL: @vector_write( +; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index +; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index +; OPT: store i32 %1, i32 addrspace(1)* %out, align 4 + ; FUNC-LABEL: {{^}}vector_write: ; EG: MOV ; EG: MOV @@ -36,42 +46,95 @@ entry: ; EG: MOVA_INT define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 + %tmp = alloca [4 x i32] + %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 store i32 0, i32* %x store i32 0, i32* %y store i32 0, i32* %z store i32 0, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index - store i32 1, i32* %1 - %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index - %3 = 
load i32, i32* %2 - store i32 %3, i32 addrspace(1)* %out + %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %w_index + store i32 1, i32* %tmp1 + %tmp2 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %r_index + %tmp3 = load i32, i32* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out ret void } ; This test should be optimize to: ; store i32 0, i32 addrspace(1)* %out + +; OPT-LABEL: @bitcast_gep( +; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4 + ; FUNC-LABEL: {{^}}bitcast_gep: ; EG: STORE_RAW define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 + %tmp = alloca [4 x i32] + %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 store i32 0, i32* %x store i32 0, i32* %y store i32 0, i32* %z store i32 0, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %2 = bitcast i32* %1 to [4 x i32]* - %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0 - %4 = load i32, i32* %3 - store i32 %4, i32 addrspace(1)* %out + %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %tmp2 = bitcast i32* %tmp1 to [4 x i32]* + %tmp3 = getelementptr [4 x i32], [4 x i32]* %tmp2, i32 0, i32 0 + %tmp4 = load i32, i32* %tmp3 + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @vector_read_bitcast_gep( +; OPT: %0 = extractelement <4 x i32> , i32 %index +; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 +define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [4 x i32] + %x = getelementptr inbounds 
[4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 + %bc = bitcast i32* %x to float* + store float 1.0, float* %bc + store i32 1, i32* %y + store i32 2, i32* %z + store i32 3, i32* %w + %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index + %tmp2 = load i32, i32* %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should be able to promote this. Instcombine should fold the +; cast in the hasOneUse case so it might not matter in practice + +; OPT-LABEL: @vector_read_bitcast_alloca( +; OPT: alloca [4 x float] +; OPT: store float +; OPT: store float +; OPT: store float +; OPT: store float +; OPT: load float +define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [4 x i32] + %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]* + %x = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 0 + %y = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 1 + %z = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 2 + %w = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 3 + store float 0.0, float* %x + store float 1.0, float* %y + store float 2.0, float* %z + store float 4.0, float* %w + %tmp1 = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 %index + %tmp2 = load float, float* %tmp1 + store float %tmp2, float addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll index 23b0ffd..809a7ba 100644 --- a/test/CodeGen/AMDGPU/wqm.ll +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -41,14 +41,14 @@ main_body: ;CHECK: store ;CHECK-NOT: exec ;CHECK: .size test3 -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, 
<4 x i32> %c) { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tex.1 = bitcast <4 x float> %tex to <4 x i32> %tex.2 = extractelement <4 x i32> %tex.1, i32 0 - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2 - %wr = extractelement <4 x float> %tex, i32 1 - store float %wr, float addrspace(1)* %gep + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0) + ret <4 x float> %tex } @@ -66,8 +66,9 @@ main_body: define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) { main_body: %c.1 = mul i32 %c, %d - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1 - store float %data, float addrspace(1)* %gep + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) ret <4 x float> %tex } @@ -89,7 +90,7 @@ main_body: ;CHECK: s_mov_b64 exec, [[SAVED]] ;CHECK: %IF ;CHECK: image_sample -define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -100,8 +101,7 @@ IF: br label %END ELSE: - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) br label 
%END END: @@ -129,7 +129,7 @@ END: ;CHECK: s_or_b64 exec, exec, ;CHECK: v_mov_b32_e32 v0 ;CHECK: ; return -define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %ELSE, label %IF @@ -140,8 +140,7 @@ IF: br label %END ELSE: - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0) br label %END END: @@ -163,23 +162,20 @@ END: ;CHECK: store ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmp -define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { main_body: %idx.1 = extractelement <3 x i32> %idx, i32 0 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 0 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) ; The load that determines the branch (and should therefore be WQM) is ; surrounded by stores that require disabled WQM. 
%idx.2 = extractelement <3 x i32> %idx, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 - %z = load float, float addrspace(1)* %gep.2 + %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0) %idx.3 = extractelement <3 x i32> %idx, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 %data.3 = extractelement <2 x float> %data, i32 1 - store float %data.3, float addrspace(1)* %gep.3 + call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0) %cc = fcmp ogt float %z, 0.0 br i1 %cc, label %IF, label %ELSE @@ -210,24 +206,21 @@ END: ;CHECK: load ;CHECK: store ;CHECK: v_cmp -define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) { +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tex.1 = extractelement <4 x float> %tex, i32 0 %idx.1 = extractelement <3 x i32> %idx, i32 0 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 0 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) %idx.2 = extractelement <3 x i32> %idx, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 - %z = load float, float addrspace(1)* %gep.2 + %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0) %idx.3 = extractelement <3 x i32> %idx, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 %data.3 = extractelement <2 x float> %data, i32 1 - store float 
%data.3, float addrspace(1)* %gep.3 + call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0) %cc = fcmp ogt float %z, 0.0 br i1 %cc, label %IF, label %ELSE @@ -258,15 +251,14 @@ END: ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: %END ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) { +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) { main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END IF: - %data = load float, float addrspace(1)* %ptr - %gep = getelementptr float, float addrspace(1)* %ptr, i32 1 - store float %data, float addrspace(1)* %gep + %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) br label %END END: @@ -282,13 +274,11 @@ END: ;CHECK-NEXT: s_wqm_b64 exec, exec ;CHECK: image_sample ;CHECK: s_and_b64 exec, exec, [[ORIG]] -;SI: buffer_store_dword -;VI: flat_store_dword +;CHECK: buffer_store_dword ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmpx_ ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]] -;SI: buffer_store_dword -;VI: flat_store_dword +;CHECK: buffer_store_dword ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) { @@ -296,16 +286,14 @@ main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %idx.0 = extractelement <2 x i32> %idx, i32 0 - %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0 %data.0 = extractelement <2 
x float> %data, i32 0 - store float %data.0, float addrspace(1)* %gep.0 + call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) call void @llvm.AMDGPU.kill(float %z) %idx.1 = extractelement <2 x i32> %idx, i32 1 - %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 %data.1 = extractelement <2 x float> %data, i32 1 - store float %data.1, float addrspace(1)* %gep.1 + call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %out = fadd <4 x float> %tex, %tex2 @@ -321,16 +309,14 @@ main_body: ; CHECK: s_wqm_b64 exec, exec ; CHECK: image_sample ; CHECK: s_and_b64 exec, exec, [[ORIG]] -; SI: buffer_store_dword -; VI: flat_store_dword +; CHECK: buffer_store_dword ; CHECK-NOT: wqm ; CHECK: v_cmpx_ -define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx - store float %data, float addrspace(1)* %gep + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) call void @llvm.AMDGPU.kill(float %z) @@ -350,9 +336,91 @@ main_body: ret float %s } +; CHECK-LABEL: {{^}}test_loop_vcc: +; CHECK-NEXT: ; %entry +; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: image_store +; CHECK: s_wqm_b64 
exec, exec +; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2 +; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]] + +; CHECK: [[LOOPHDR]]: ; %loop +; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]] +; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]] +; CHECK: s_cbranch_vccz +; CHECK: ; %break + +; CHECK: ; return +define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { +entry: + call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0) + br label %loop + +loop: + %ctr.iv = phi i32 [ 0, %entry ], [ %ctr.next, %body ] + %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] + %cc = icmp sgt i32 %ctr.iv, 7 + br i1 %cc, label %break, label %body + +body: + %c.i = bitcast <4 x float> %c.iv to <4 x i32> + %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %ctr.next = add i32 %ctr.iv, 2 + br label %loop + +break: + ret <4 x float> %c.iv +} + +; Only intrinsic stores need exact execution -- other stores do not have +; externally visible effects and may require WQM for correctness. 
+; +; CHECK-LABEL: {{^}}test_alloca: +; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec + +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 +; CHECK: s_wqm_b64 exec, exec +; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen +; CHECK: s_wqm_b64 exec, exec +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen + +; CHECK: image_sample +; CHECK: s_and_b64 exec, exec, [[LIVE]] +; CHECK: buffer_store_dwordx4 +define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { +entry: + %array = alloca [32 x i32], align 4 + + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + + %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0 + store volatile i32 %a, i32* %s.gep, align 4 + + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0) + + %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx + %c = load i32, i32* %c.gep, align 4 + + %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) + + ret void +} + + declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 
declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll index 151cc1b..04eae8f 100644 --- a/test/CodeGen/ARM/arm-and-tst-peephole.ll +++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll @@ -49,7 +49,7 @@ tailrecurse.switch: ; preds = %tailrecurse ; V8-NEXT: beq ; V8-NEXT: %tailrecurse.switch ; V8: cmp -; V8-NEXT: beq +; V8-NEXT: bne ; V8-NEXT: b ; The trailing space in the last line checks that the branch is unconditional switch i32 %and, label %sw.epilog [ diff --git a/test/CodeGen/ARM/ssat-v4t.ll b/test/CodeGen/ARM/ssat-v4t.ll new file mode 100644 index 0000000..3d74c88 --- /dev/null +++ b/test/CodeGen/ARM/ssat-v4t.ll @@ -0,0 +1,9 @@ +; RUN: not llc -O1 -mtriple=armv4t-none-none-eabi %s -o - 2>&1 | FileCheck %s + +; CHECK: Cannot select: intrinsic %llvm.arm.ssat +define i32 @ssat() nounwind { + %tmp = call i32 @llvm.arm.ssat(i32 128, i32 1) + ret i32 %tmp +} + +declare i32 @llvm.arm.ssat(i32, i32) nounwind readnone diff --git a/test/CodeGen/ARM/ssat.ll b/test/CodeGen/ARM/ssat.ll index 2b75bc4..f1e11dd 100644 --- a/test/CodeGen/ARM/ssat.ll +++ b/test/CodeGen/ARM/ssat.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s +; RUN: llc -mtriple=armv4t-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V4T +; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6T2 ; Check for several conditions that should result in SSAT. 
; For example, the base test is equivalent to @@ -16,7 +17,8 @@ ; 32-bit base test define i32 @sat_base_32bit(i32 %x) #0 { ; CHECK-LABEL: sat_base_32bit: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp slt i32 %x, -8388608 %cmpUp = icmp sgt i32 %x, 8388607 @@ -29,7 +31,8 @@ entry: ; 16-bit base test define i16 @sat_base_16bit(i16 %x) #0 { ; CHECK-LABEL: sat_base_16bit: -; CHECK: ssat r0, #12, r0 +; V6T2: ssat r0, #12, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp slt i16 %x, -2048 %cmpUp = icmp sgt i16 %x, 2047 @@ -42,7 +45,8 @@ entry: ; 8-bit base test define i8 @sat_base_8bit(i8 %x) #0 { ; CHECK-LABEL: sat_base_8bit: -; CHECK: ssat r0, #6, r0 +; V6T2: ssat r0, #6, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp slt i8 %x, -32 %cmpUp = icmp sgt i8 %x, 31 @@ -60,7 +64,8 @@ entry: ; x < -k ? -k : (x < k ? x : k) define i32 @sat_lower_upper_1(i32 %x) #0 { ; CHECK-LABEL: sat_lower_upper_1: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp slt i32 %x, -8388608 %cmpUp = icmp slt i32 %x, 8388607 @@ -72,7 +77,8 @@ entry: ; x > -k ? (x > k ? k : x) : -k define i32 @sat_lower_upper_2(i32 %x) #0 { ; CHECK-LABEL: sat_lower_upper_2: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpLow = icmp sgt i32 %x, -8388608 %cmpUp = icmp sgt i32 %x, 8388607 @@ -84,7 +90,8 @@ entry: ; x < k ? (x < -k ? -k : x) : k define i32 @sat_upper_lower_1(i32 %x) #0 { ; CHECK-LABEL: sat_upper_lower_1: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpUp = icmp slt i32 %x, 8388607 %cmpLow = icmp slt i32 %x, -8388608 @@ -96,7 +103,8 @@ entry: ; x > k ? k : (x < -k ? -k : x) define i32 @sat_upper_lower_2(i32 %x) #0 { ; CHECK-LABEL: sat_upper_lower_2: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpUp = icmp sgt i32 %x, 8388607 %cmpLow = icmp slt i32 %x, -8388608 @@ -108,7 +116,8 @@ entry: ; k < x ? k : (x > -k ? 
x : -k) define i32 @sat_upper_lower_3(i32 %x) #0 { ; CHECK-LABEL: sat_upper_lower_3: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpUp = icmp slt i32 8388607, %x %cmpLow = icmp sgt i32 %x, -8388608 @@ -125,7 +134,8 @@ entry: ; k <= x ? k : (x >= -k ? x : -k) define i32 @sat_le_ge(i32 %x) #0 { ; CHECK-LABEL: sat_le_ge: -; CHECK: ssat r0, #24, r0 +; V6T2: ssat r0, #24, r0 +; V4T-NOT: ssat entry: %cmpUp = icmp sle i32 8388607, %x %cmpLow = icmp sge i32 %x, -8388608 diff --git a/test/CodeGen/ARM/usat-v4t.ll b/test/CodeGen/ARM/usat-v4t.ll new file mode 100644 index 0000000..572c760 --- /dev/null +++ b/test/CodeGen/ARM/usat-v4t.ll @@ -0,0 +1,9 @@ +; RUN: not llc -O1 -mtriple=armv4t-none-none-eabi %s -o - 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.arm.usat +define i32 @usat1() nounwind { + %tmp = call i32 @llvm.arm.usat(i32 128, i32 31) + ret i32 %tmp +} + +declare i32 @llvm.arm.usat(i32, i32) nounwind readnone diff --git a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll index f736ddd..c0229c6 100644 --- a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll +++ b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll @@ -11,13 +11,13 @@ entry: ; PIC-O32: lwc1 $f0, %lo($CPI0_0)($[[R0]]) ; STATIC-O32: lui $[[R0:[0-9]+]], %hi($CPI0_0) ; STATIC-O32: lwc1 $f0, %lo($CPI0_0)($[[R0]]) -; PIC-N32: lw $[[R0:[0-9]+]], %got_page($CPI0_0) -; PIC-N32: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]]) -; STATIC-N32: lui $[[R0:[0-9]+]], %hi($CPI0_0) -; STATIC-N32: lwc1 $f0, %lo($CPI0_0)($[[R0]]) -; PIC-N64: ld $[[R0:[0-9]+]], %got_page($CPI0_0) -; PIC-N64: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]]) -; STATIC-N64: ld $[[R0:[0-9]+]], %got_page($CPI0_0) -; STATIC-N64: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]]) +; PIC-N32: lw $[[R0:[0-9]+]], %got_page(.LCPI0_0) +; PIC-N32: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]]) +; STATIC-N32: lui $[[R0:[0-9]+]], %hi(.LCPI0_0) +; STATIC-N32: lwc1 $f0, %lo(.LCPI0_0)($[[R0]]) +; PIC-N64: ld 
$[[R0:[0-9]+]], %got_page(.LCPI0_0) +; PIC-N64: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]]) +; STATIC-N64: ld $[[R0:[0-9]+]], %got_page(.LCPI0_0) +; STATIC-N64: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]]) ret float 0x400B333340000000 } diff --git a/test/CodeGen/Mips/2010-07-20-Switch.ll b/test/CodeGen/Mips/2010-07-20-Switch.ll index 7d66d1a..5f0a0a5 100644 --- a/test/CodeGen/Mips/2010-07-20-Switch.ll +++ b/test/CodeGen/Mips/2010-07-20-Switch.ll @@ -27,9 +27,9 @@ entry: ; PIC-O32: addu $[[R5:[0-9]+]], $[[R4:[0-9]+]] ; PIC-O32: jr $[[R5]] ; N64: dsll $[[R0:[0-9]+]], ${{[0-9]+}}, 3 -; N64: ld $[[R1:[0-9]+]], %got_page($JTI0_0) +; N64: ld $[[R1:[0-9]+]], %got_page(.LJTI0_0) ; N64: daddu $[[R2:[0-9]+]], $[[R0:[0-9]+]], $[[R1]] -; N64: ld $[[R4:[0-9]+]], %got_ofst($JTI0_0)($[[R2]]) +; N64: ld $[[R4:[0-9]+]], %got_ofst(.LJTI0_0)($[[R2]]) ; N64: daddu $[[R5:[0-9]+]], $[[R4:[0-9]+]] ; N64: jr $[[R5]] switch i32 %0, label %bb4 [ @@ -68,7 +68,7 @@ bb5: ; preds = %entry ; PIC-O32: .gpword ; PIC-O32: .gpword ; N64: .p2align 3 -; N64: $JTI0_0: +; N64: .LJTI0_0: ; N64: .gpdword ; N64: .gpdword ; N64: .gpdword diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll index 377fe93..6215087 100644 --- a/test/CodeGen/Mips/analyzebranch.ll +++ b/test/CodeGen/Mips/analyzebranch.ll @@ -10,7 +10,7 @@ define double @foo(double %a, double %b) nounwind readnone { entry: ; ALL-LABEL: foo: -; FCC: bc1f $BB +; FCC: bc1f {{\$|\.L}}BB ; FCC: nop ; 32-GPR: mtc1 $zero, $[[Z:f[0-9]]] @@ -19,7 +19,7 @@ entry: ; GPR: cmp.lt.d $[[FGRCC:f[0-9]+]], $[[Z]], $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] -; GPR: bnezc $[[GPRCC]], $BB +; GPR: bnezc $[[GPRCC]], {{\$|\.L}}BB %cmp = fcmp ogt double %a, 0.000000e+00 br i1 %cmp, label %if.end6, label %if.else @@ -43,7 +43,7 @@ define void @f1(float %f) nounwind { entry: ; ALL-LABEL: f1: -; FCC: bc1f $BB +; FCC: bc1f {{\$|\.L}}BB ; FCC: nop ; GPR: mtc1 $zero, $[[Z:f[0-9]]] diff --git 
a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll index 8f4ccb1..dfba8ba 100644 --- a/test/CodeGen/Mips/atomic.ll +++ b/test/CodeGen/Mips/atomic.ll @@ -34,17 +34,17 @@ entry: ; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) ; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)( -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R1:[0-9]+]] ; O0-NEXT: ll $[[R2:[0-9]+]], 0($[[R1]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R3:[0-9]+]], 0($[[R0]]) ; ALL: addu $[[R4:[0-9]+]], $[[R3]], $4 ; ALL: sc $[[R4]], 0($[[R0]]) -; NOT-MICROMIPS: beqz $[[R4]], $[[BB0]] -; MICROMIPS: beqzc $[[R4]], $[[BB0]] -; MIPSR6: beqzc $[[R4]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R4]], [[BB0]] +; MICROMIPS: beqzc $[[R4]], [[BB0]] +; MIPSR6: beqzc $[[R4]], [[BB0]] } define i32 @AtomicLoadNand32(i32 signext %incr) nounwind { @@ -59,14 +59,14 @@ entry: -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R1:[0-9]+]], 0($[[R0]]) ; ALL: and $[[R3:[0-9]+]], $[[R1]], $4 ; ALL: nor $[[R2:[0-9]+]], $zero, $[[R3]] ; ALL: sc $[[R2]], 0($[[R0]]) -; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]] -; MICROMIPS: beqzc $[[R2]], $[[BB0]] -; MIPSR6: beqzc $[[R2]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R2]], [[BB0]] +; MICROMIPS: beqzc $[[R2]], [[BB0]] +; MIPSR6: beqzc $[[R2]], [[BB0]] } define i32 @AtomicSwap32(i32 signext %newval) nounwind { @@ -82,12 +82,12 @@ entry: ; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) ; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll ${{[0-9]+}}, 0($[[R0]]) ; ALL: sc $[[R2:[0-9]+]], 0($[[R0]]) -; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]] -; MICROMIPS: beqzc $[[R2]], $[[BB0]] -; MIPSR6: beqzc $[[R2]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R2]], [[BB0]] +; MICROMIPS: beqzc $[[R2]], [[BB0]] +; MIPSR6: beqzc $[[R2]], [[BB0]] } define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind { @@ -104,16 +104,16 @@ entry: ; MIPS32-ANY: lw 
$[[R0:[0-9]+]], %got(x) ; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)( -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $2, 0($[[R0]]) -; NOT-MICROMIPS: bne $2, $4, $[[BB1:[A-Z_0-9]+]] -; MICROMIPS: bne $2, $4, $[[BB1:[A-Z_0-9]+]] -; MIPSR6: bnec $2, $4, $[[BB1:[A-Z_0-9]+]] +; NOT-MICROMIPS: bne $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MICROMIPS: bne $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MIPSR6: bnec $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]] ; ALL: sc $[[R2:[0-9]+]], 0($[[R0]]) -; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]] -; MICROMIPS: beqzc $[[R2]], $[[BB0]] -; MIPSR6: beqzc $[[R2]], $[[BB0]] -; ALL: $[[BB1]]: +; NOT-MICROMIPS: beqz $[[R2]], [[BB0]] +; MICROMIPS: beqzc $[[R2]], [[BB0]] +; MIPSR6: beqzc $[[R2]], [[BB0]] +; ALL: [[BB1]]: } @@ -141,20 +141,20 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R10:[0-9]+]] ; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R12:[0-9]+]], 0($[[R2]]) ; ALL: addu $[[R13:[0-9]+]], $[[R12]], $[[R9]] ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] ; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]] ; ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] ; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]] ; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]] @@ -186,20 +186,20 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R10:[0-9]+]] ; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R12:[0-9]+]], 0($[[R2]]) ; ALL: 
subu $[[R13:[0-9]+]], $[[R12]], $[[R9]] ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] ; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]] ; ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] ; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]] ; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]] @@ -231,11 +231,11 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R10:[0-9]+]] ; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R12:[0-9]+]], 0($[[R2]]) ; ALL: and $[[R13:[0-9]+]], $[[R12]], $[[R9]] ; ALL: nor $[[R14:[0-9]+]], $zero, $[[R13]] @@ -243,9 +243,9 @@ entry: ; ALL: and $[[R16:[0-9]+]], $[[R12]], $[[R8]] ; ALL: or $[[R17:[0-9]+]], $[[R16]], $[[R15]] ; ALL: sc $[[R17]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R17]], $[[BB0]] -; MICROMIPS: beqzc $[[R17]], $[[BB0]] -; MIPSR6: beqzc $[[R17]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R17]], [[BB0]] +; MICROMIPS: beqzc $[[R17]], [[BB0]] +; MIPSR6: beqzc $[[R17]], [[BB0]] ; ALL: and $[[R18:[0-9]+]], $[[R12]], $[[R7]] ; ALL: srlv $[[R19:[0-9]+]], $[[R18]], $[[R5]] @@ -277,15 +277,15 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R10:[0-9]+]], 0($[[R2]]) ; ALL: and $[[R18:[0-9]+]], $[[R9]], $[[R7]] ; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] ; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R18]] ; ALL: sc $[[R14]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R14]], $[[BB0]] -; MICROMIPS: beqzc $[[R14]], $[[BB0]] -; MIPSR6: beqzc $[[R14]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R14]], [[BB0]] +; MICROMIPS: 
beqzc $[[R14]], [[BB0]] +; MIPSR6: beqzc $[[R14]], [[BB0]] ; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] ; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] @@ -322,21 +322,21 @@ entry: ; ALL: andi $[[R11:[0-9]+]], $5, 255 ; ALL: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]] -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R13:[0-9]+]], 0($[[R2]]) ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] -; NOT-MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] -; MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] -; MIPSR6: bnec $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] +; NOT-MICROMIPS: bne $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MICROMIPS: bne $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MIPSR6: bnec $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] ; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]] ; ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] -; ALL: $[[BB1]]: +; ALL: [[BB1]]: ; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]] ; NO-SEB-SEH: sll $[[R18:[0-9]+]], $[[R17]], 24 @@ -366,21 +366,21 @@ entry: ; ALL: andi $[[R11:[0-9]+]], $6, 255 ; ALL: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]] -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R13:[0-9]+]], 0($[[R2]]) ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] -; NOT-MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] -; MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] -; MIPSR6: bnec $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] +; NOT-MICROMIPS: bne $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MICROMIPS: bne $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] +; MIPSR6: bnec $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]] ; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]] ; 
ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] -; ALL: $[[BB1]]: +; ALL: [[BB1]]: ; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]] ; NO-SEB-SEH: sll $[[R18:[0-9]+]], $[[R17]], 24 @@ -423,20 +423,20 @@ entry: ; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] ; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] -; O0: $[[BB0:[A-Z_0-9]+]]: +; O0: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; O0: ld $[[R10:[0-9]+]] ; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]]) -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R12:[0-9]+]], 0($[[R2]]) ; ALL: addu $[[R13:[0-9]+]], $[[R12]], $[[R9]] ; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] ; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]] ; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]] ; ALL: sc $[[R16]], 0($[[R2]]) -; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]] -; MICROMIPS: beqzc $[[R16]], $[[BB0]] -; MIPSR6: beqzc $[[R16]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R16]], [[BB0]] +; MICROMIPS: beqzc $[[R16]], [[BB0]] +; MIPSR6: beqzc $[[R16]], [[BB0]] ; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]] ; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]] @@ -465,15 +465,15 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) { ; ALL: sync ; ALL: andi $[[R3:[0-9]+]], $[[R2]], 65535 -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R4:[0-9]+]], 0($[[R5:[0-9]+]]) ; ALL: and $[[R6:[0-9]+]], $[[R4]], $ ; ALL: and $[[R7:[0-9]+]], $[[R4]], $ ; ALL: or $[[R8:[0-9]+]], $[[R7]], $ ; ALL: sc $[[R8]], 0($[[R5]]) -; NOT-MICROMIPS: beqz $[[R8]], $[[BB0]] -; MICROMIPS: beqzc $[[R8]], $[[BB0]] -; MIPSR6: beqzc $[[R8]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R8]], [[BB0]] +; MICROMIPS: beqzc $[[R8]], [[BB0]] +; MIPSR6: beqzc $[[R8]], [[BB0]] ; ALL: srlv $[[R9:[0-9]+]], $[[R6]], $ @@ -538,11 +538,11 @@ entry: ; MIPS64-ANY: ld $[[R0:[0-9]+]], 
%got_disp(x)( ; ALL: addiu $[[PTR:[0-9]+]], $[[R0]], 1024 -; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: [[BB0:(\$|\.L)[A-Z_0-9]+]]: ; ALL: ll $[[R1:[0-9]+]], 0($[[PTR]]) ; ALL: addu $[[R2:[0-9]+]], $[[R1]], $4 ; ALL: sc $[[R2]], 0($[[PTR]]) -; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]] -; MICROMIPS: beqzc $[[R2]], $[[BB0]] -; MIPSR6: beqzc $[[R2]], $[[BB0]] +; NOT-MICROMIPS: beqz $[[R2]], [[BB0]] +; MICROMIPS: beqzc $[[R2]], [[BB0]] +; MIPSR6: beqzc $[[R2]], [[BB0]] } diff --git a/test/CodeGen/Mips/blez_bgez.ll b/test/CodeGen/Mips/blez_bgez.ll index dcda047..84c8af4 100644 --- a/test/CodeGen/Mips/blez_bgez.ll +++ b/test/CodeGen/Mips/blez_bgez.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=mips64el < %s | FileCheck %s ; CHECK-LABEL: test_blez: -; CHECK: blez ${{[0-9]+}}, $BB +; CHECK: blez ${{[0-9]+}}, {{\$|\.L}}BB define void @test_blez(i32 %a) { entry: @@ -20,7 +20,7 @@ if.end: declare void @foo1() ; CHECK-LABEL: test_bgez: -; CHECK: bgez ${{[0-9]+}}, $BB +; CHECK: bgez ${{[0-9]+}}, {{\$|\.L}}BB define void @test_bgez(i32 %a) { entry: diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll index f743637..9bc9a30 100644 --- a/test/CodeGen/Mips/blockaddr.ll +++ b/test/CodeGen/Mips/blockaddr.ll @@ -22,22 +22,22 @@ entry: ; STATIC-O32: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp[[T2]]) ; STATIC-O32: lui $[[R3:[0-9]+]], %hi($tmp[[T3:[0-9]+]]) ; STATIC-O32: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp[[T3]]) -; PIC-N32: lw $[[R0:[0-9]+]], %got_page($tmp[[T0:[0-9]+]]) -; PIC-N32: addiu ${{[0-9]+}}, $[[R0]], %got_ofst($tmp[[T0]]) -; PIC-N32: lw $[[R1:[0-9]+]], %got_page($tmp[[T1:[0-9]+]]) -; PIC-N32: addiu ${{[0-9]+}}, $[[R1]], %got_ofst($tmp[[T1]]) -; STATIC-N32: lui $[[R2:[0-9]+]], %hi($tmp[[T2:[0-9]+]]) -; STATIC-N32: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp[[T2]]) -; STATIC-N32: lui $[[R3:[0-9]+]], %hi($tmp[[T3:[0-9]+]]) -; STATIC-N32: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp[[T3]]) -; PIC-N64: ld $[[R0:[0-9]+]], %got_page($tmp[[T0:[0-9]+]]) -; PIC-N64: daddiu ${{[0-9]+}}, $[[R0]], 
%got_ofst($tmp[[T0]]) -; PIC-N64: ld $[[R1:[0-9]+]], %got_page($tmp[[T1:[0-9]+]]) -; PIC-N64: daddiu ${{[0-9]+}}, $[[R1]], %got_ofst($tmp[[T1]]) -; STATIC-N64: ld $[[R2:[0-9]+]], %got_page($tmp[[T2:[0-9]+]]) -; STATIC-N64: daddiu ${{[0-9]+}}, $[[R2]], %got_ofst($tmp[[T2]]) -; STATIC-N64: ld $[[R3:[0-9]+]], %got_page($tmp[[T3:[0-9]+]]) -; STATIC-N64: daddiu ${{[0-9]+}}, $[[R3]], %got_ofst($tmp[[T3]]) +; PIC-N32: lw $[[R0:[0-9]+]], %got_page(.Ltmp[[T0:[0-9]+]]) +; PIC-N32: addiu ${{[0-9]+}}, $[[R0]], %got_ofst(.Ltmp[[T0]]) +; PIC-N32: lw $[[R1:[0-9]+]], %got_page(.Ltmp[[T1:[0-9]+]]) +; PIC-N32: addiu ${{[0-9]+}}, $[[R1]], %got_ofst(.Ltmp[[T1]]) +; STATIC-N32: lui $[[R2:[0-9]+]], %hi(.Ltmp[[T2:[0-9]+]]) +; STATIC-N32: addiu ${{[0-9]+}}, $[[R2]], %lo(.Ltmp[[T2]]) +; STATIC-N32: lui $[[R3:[0-9]+]], %hi(.Ltmp[[T3:[0-9]+]]) +; STATIC-N32: addiu ${{[0-9]+}}, $[[R3]], %lo(.Ltmp[[T3]]) +; PIC-N64: ld $[[R0:[0-9]+]], %got_page(.Ltmp[[T0:[0-9]+]]) +; PIC-N64: daddiu ${{[0-9]+}}, $[[R0]], %got_ofst(.Ltmp[[T0]]) +; PIC-N64: ld $[[R1:[0-9]+]], %got_page(.Ltmp[[T1:[0-9]+]]) +; PIC-N64: daddiu ${{[0-9]+}}, $[[R1]], %got_ofst(.Ltmp[[T1]]) +; STATIC-N64: ld $[[R2:[0-9]+]], %got_page(.Ltmp[[T2:[0-9]+]]) +; STATIC-N64: daddiu ${{[0-9]+}}, $[[R2]], %got_ofst(.Ltmp[[T2]]) +; STATIC-N64: ld $[[R3:[0-9]+]], %got_page(.Ltmp[[T3:[0-9]+]]) +; STATIC-N64: daddiu ${{[0-9]+}}, $[[R3]], %got_ofst(.Ltmp[[T3]]) ; STATIC-MIPS16-1: .ent f ; STATIC-MIPS16-2: .ent f ; STATIC-MIPS16-1: li $[[R1_16:[0-9]+]], %hi($tmp[[TI_16:[0-9]+]]) diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll index d6d4767..9352294 100644 --- a/test/CodeGen/Mips/ehframe-indirect.ll +++ b/test/CodeGen/Mips/ehframe-indirect.ll @@ -33,9 +33,15 @@ declare void @foo() ; ALL: GCC_except_table{{[0-9]+}}: ; ALL: .byte 155 # @TType Encoding = indirect pcrel sdata4 -; ALL: $[[PC_LABEL:tmp[0-9]+]]: -; ALL: .4byte ($_ZTISt9exception.DW.stub)-($[[PC_LABEL]]) -; ALL: $_ZTISt9exception.DW.stub: +; O32: 
[[PC_LABEL:\$tmp[0-9]+]]: +; N32: [[PC_LABEL:\.Ltmp[0-9]+]]: +; N64: [[PC_LABEL:\.Ltmp[0-9]+]]: +; O32: .4byte ($_ZTISt9exception.DW.stub)-([[PC_LABEL]]) +; N32: .4byte .L_ZTISt9exception.DW.stub-[[PC_LABEL]] +; N64: .4byte .L_ZTISt9exception.DW.stub-[[PC_LABEL]] +; O32: $_ZTISt9exception.DW.stub: +; N32: .L_ZTISt9exception.DW.stub: +; N64: .L_ZTISt9exception.DW.stub: ; O32: .4byte _ZTISt9exception ; N32: .4byte _ZTISt9exception ; N64: .8byte _ZTISt9exception diff --git a/test/CodeGen/Mips/fcmp.ll b/test/CodeGen/Mips/fcmp.ll index 142ee11..bd04ed0 100644 --- a/test/CodeGen/Mips/fcmp.ll +++ b/test/CodeGen/Mips/fcmp.ll @@ -1076,12 +1076,12 @@ entry: ; 32-CMP-DAG: bnezc $[[T4]], ; 64-C-DAG: add.s $[[T0:f[0-9]+]], $f13, $f12 -; 64-C-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst($CPI32_0)( +; 64-C-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst(.LCPI32_0)( ; 64-C-DAG: c.ole.s $[[T0]], $[[T1]] ; 64-C-DAG: bc1t ; 64-CMP-DAG: add.s $[[T0:f[0-9]+]], $f13, $f12 -; 64-CMP-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst($CPI32_0)( +; 64-CMP-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst(.LCPI32_0)( ; 64-CMP-DAG: cmp.le.s $[[T2:f[0-9]+]], $[[T0]], $[[T1]] ; 64-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]] ; FIXME: This instruction is redundant. 
@@ -1106,8 +1106,8 @@ entry: ; MM64R6-DAG: daddu $[[T1:[0-9]+]], $[[T0]], $25 ; MM64R6-DAG: daddiu $[[T2:[0-9]+]], $[[T1]], %lo(%neg(%gp_rel(bug1_f32))) ; MM64R6-DAG: add.s $[[T3:f[0-9]+]], $f13, $f12 -; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page($CPI32_0)($[[T2]]) -; MM64R6-DAG: lwc1 $[[T5:f[0-9]+]], %got_ofst($CPI32_0)($[[T4]]) +; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page(.LCPI32_0)($[[T2]]) +; MM64R6-DAG: lwc1 $[[T5:f[0-9]+]], %got_ofst(.LCPI32_0)($[[T4]]) ; MM64R6-DAG: cmp.le.s $[[T6:f[0-9]+]], $[[T3]], $[[T5]] ; MM64R6-DAG: mfc1 $[[T7:[0-9]+]], $[[T6]] ; MM64R6-DAG: andi16 $[[T8:[0-9]+]], $[[T7]], 1 @@ -1145,12 +1145,12 @@ entry: ; 32-CMP-DAG: bnezc $[[T4]], ; 64-C-DAG: add.d $[[T0:f[0-9]+]], $f13, $f12 -; 64-C-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst($CPI33_0)( +; 64-C-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst(.LCPI33_0)( ; 64-C-DAG: c.ole.d $[[T0]], $[[T1]] ; 64-C-DAG: bc1t ; 64-CMP-DAG: add.d $[[T0:f[0-9]+]], $f13, $f12 -; 64-CMP-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst($CPI33_0)( +; 64-CMP-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst(.LCPI33_0)( ; 64-CMP-DAG: cmp.le.d $[[T2:f[0-9]+]], $[[T0]], $[[T1]] ; 64-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]] ; FIXME: This instruction is redundant. 
@@ -1175,8 +1175,8 @@ entry: ; MM64R6-DAG: daddu $[[T1:[0-9]+]], $[[T0]], $25 ; MM64R6-DAG: daddiu $[[T2:[0-9]+]], $[[T1]], %lo(%neg(%gp_rel(bug1_f64))) ; MM64R6-DAG: add.d $[[T3:f[0-9]+]], $f13, $f12 -; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page($CPI33_0)($[[T2]]) -; MM64R6-DAG: ldc1 $[[T5:f[0-9]+]], %got_ofst($CPI33_0)($[[T4]]) +; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page(.LCPI33_0)($[[T2]]) +; MM64R6-DAG: ldc1 $[[T5:f[0-9]+]], %got_ofst(.LCPI33_0)($[[T4]]) ; MM64R6-DAG: cmp.le.d $[[T6:f[0-9]+]], $[[T3]], $[[T5]] ; MM64R6-DAG: mfc1 $[[T7:[0-9]+]], $[[T6]] ; MM64R6-DAG: andi16 $[[T8:[0-9]+]], $[[T7]], 1 diff --git a/test/CodeGen/Mips/fpbr.ll b/test/CodeGen/Mips/fpbr.ll index bf1b045..7fb508f 100644 --- a/test/CodeGen/Mips/fpbr.ll +++ b/test/CodeGen/Mips/fpbr.ll @@ -10,8 +10,9 @@ entry: ; ALL-LABEL: func0: ; 32-FCC: c.eq.s $f12, $f14 +; 32-FCC: bc1f $BB0_2 ; 64-FCC: c.eq.s $f12, $f13 -; FCC: bc1f $BB0_2 +; 64-FCC: bc1f .LBB0_2 ; 32-GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f14 ; 64-GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f13 @@ -19,7 +20,7 @@ entry: ; FIXME: We ought to be able to transform not+bnez -> beqz ; GPR: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: bnez $[[GPRCC]], $BB0_2 -; 64-GPR: bnezc $[[GPRCC]], $BB0_2 +; 64-GPR: bnezc $[[GPRCC]], .LBB0_2 %cmp = fcmp oeq float %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -45,15 +46,16 @@ entry: ; ALL-LABEL: func1: ; 32-FCC: c.olt.s $f12, $f14 +; 32-FCC: bc1f $BB1_2 ; 64-FCC: c.olt.s $f12, $f13 -; FCC: bc1f $BB1_2 +; 64-FCC: bc1f .LBB1_2 ; 32-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f14, $f12 ; 64-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f13, $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: bnez $[[GPRCC]], $BB1_2 -; 64-GPR: bnezc $[[GPRCC]], $BB1_2 +; 64-GPR: bnezc $[[GPRCC]], .LBB1_2 %cmp = fcmp olt float %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -75,15 +77,16 @@ entry: ; ALL-LABEL: func2: ; 32-FCC: c.ole.s $f12, $f14 +; 32-FCC: bc1t $BB2_2 ; 64-FCC: c.ole.s 
$f12, $f13 -; FCC: bc1t $BB2_2 +; 64-FCC: bc1t .LBB2_2 ; 32-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f14, $f12 ; 64-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f13, $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: beqz $[[GPRCC]], $BB2_2 -; 64-GPR: beqzc $[[GPRCC]], $BB2_2 +; 64-GPR: beqzc $[[GPRCC]], .LBB2_2 %cmp = fcmp ugt float %f2, %f3 br i1 %cmp, label %if.else, label %if.then @@ -105,8 +108,9 @@ entry: ; ALL-LABEL: func3: ; 32-FCC: c.eq.d $f12, $f14 +; 32-FCC: bc1f $BB3_2 ; 64-FCC: c.eq.d $f12, $f13 -; FCC: bc1f $BB3_2 +; 64-FCC: bc1f .LBB3_2 ; 32-GPR: cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f14 ; 64-GPR: cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f13 @@ -114,7 +118,7 @@ entry: ; FIXME: We ought to be able to transform not+bnez -> beqz ; GPR: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: bnez $[[GPRCC]], $BB3_2 -; 64-GPR: bnezc $[[GPRCC]], $BB3_2 +; 64-GPR: bnezc $[[GPRCC]], .LBB3_2 %cmp = fcmp oeq double %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -136,15 +140,16 @@ entry: ; ALL-LABEL: func4: ; 32-FCC: c.olt.d $f12, $f14 +; 32-FCC: bc1f $BB4_2 ; 64-FCC: c.olt.d $f12, $f13 -; FCC: bc1f $BB4_2 +; 64-FCC: bc1f .LBB4_2 ; 32-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f14, $f12 ; 64-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f13, $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: bnez $[[GPRCC]], $BB4_2 -; 64-GPR: bnezc $[[GPRCC]], $BB4_2 +; 64-GPR: bnezc $[[GPRCC]], .LBB4_2 %cmp = fcmp olt double %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -166,15 +171,16 @@ entry: ; ALL-LABEL: func5: ; 32-FCC: c.ole.d $f12, $f14 +; 32-FCC: bc1t $BB5_2 ; 64-FCC: c.ole.d $f12, $f13 -; FCC: bc1t $BB5_2 +; 64-FCC: bc1t .LBB5_2 ; 32-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f14, $f12 ; 64-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f13, $f12 ; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] ; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] ; 32-GPR: beqz $[[GPRCC]], $BB5_2 -; 64-GPR: beqzc $[[GPRCC]], $BB5_2 +; 64-GPR: beqzc 
$[[GPRCC]], .LBB5_2 %cmp = fcmp ugt double %f2, %f3 br i1 %cmp, label %if.else, label %if.then diff --git a/test/CodeGen/Mips/jumptable_labels.ll b/test/CodeGen/Mips/jumptable_labels.ll new file mode 100644 index 0000000..8c7edc1 --- /dev/null +++ b/test/CodeGen/Mips/jumptable_labels.ll @@ -0,0 +1,75 @@ +; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=O32 +; RUN: llc -march=mips64 -target-abi=n32 < %s | FileCheck %s -check-prefix=N32 +; RUN: llc -march=mips64 < %s | FileCheck %s -check-prefix=N64 + +; We only use the '$' prefix on O32. The others use the ELF convention. +; O32: $JTI0_0 +; N32: .LJTI0_0 +; N64: .LJTI0_0 + +; Check basic block labels while we're at it. +; O32: $BB0_2: +; N32: .LBB0_2: +; N64: .LBB0_2: + +@.str = private unnamed_addr constant [2 x i8] c"A\00", align 1 +@.str.1 = private unnamed_addr constant [2 x i8] c"B\00", align 1 +@.str.2 = private unnamed_addr constant [2 x i8] c"C\00", align 1 +@.str.3 = private unnamed_addr constant [2 x i8] c"D\00", align 1 +@.str.4 = private unnamed_addr constant [2 x i8] c"E\00", align 1 +@.str.5 = private unnamed_addr constant [2 x i8] c"F\00", align 1 +@.str.6 = private unnamed_addr constant [2 x i8] c"G\00", align 1 +@.str.7 = private unnamed_addr constant [1 x i8] zeroinitializer, align 1 + +define i8* @_Z3fooi(i32 signext %Letter) { +entry: + %retval = alloca i8*, align 8 + %Letter.addr = alloca i32, align 4 + store i32 %Letter, i32* %Letter.addr, align 4 + %0 = load i32, i32* %Letter.addr, align 4 + switch i32 %0, label %sw.epilog [ + i32 0, label %sw.bb + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + i32 4, label %sw.bb4 + i32 5, label %sw.bb5 + i32 6, label %sw.bb6 + ] + +sw.bb: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb1: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb2: + store i8* getelementptr 
inbounds ([2 x i8], [2 x i8]* @.str.2, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb3: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.3, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb4: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.4, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb5: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.5, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.bb6: + store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.6, i32 0, i32 0), i8** %retval, align 8 + br label %return + +sw.epilog: + store i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str.7, i32 0, i32 0), i8** %retval, align 8 + br label %return + +return: + %1 = load i8*, i8** %retval, align 8 + ret i8* %1 +} diff --git a/test/CodeGen/Mips/llvm-ir/ashr.ll b/test/CodeGen/Mips/llvm-ir/ashr.ll index af9b81f..cfb9855 100644 --- a/test/CodeGen/Mips/llvm-ir/ashr.ll +++ b/test/CodeGen/Mips/llvm-ir/ashr.ll @@ -167,18 +167,18 @@ entry: ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrav $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]] + ; M3: bnez $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]] ; M3: move $3, $[[T1]] ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: $[[BB0]]: - ; M3: beqz $[[T3]], $[[BB1:BB[0-9_]+]] + ; M3: [[BB0]]: + ; M3: beqz $[[T3]], [[BB1:.LBB[0-9_]+]] ; M3: nop ; M3: dsra $2, $4, 63 - ; M3: $[[BB1]]: + ; M3: [[BB1]]: ; M3: jr $ra ; M3: nop diff --git a/test/CodeGen/Mips/llvm-ir/indirectbr.ll b/test/CodeGen/Mips/llvm-ir/indirectbr.ll index d982b57..8fed32a 100644 --- a/test/CodeGen/Mips/llvm-ir/indirectbr.ll +++ b/test/CodeGen/Mips/llvm-ir/indirectbr.ll @@ -18,13 +18,13 @@ define i32 @br(i8 *%addr) { ; R6C: jrc $4 # , <16 x i8>*@v16i8 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, 
%lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <16 x i8> , <16 x i8>*@v16i8 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <16 x i8> , <16 x i8>*@v16i8 @@ -59,8 +59,8 @@ define void @const_v16i8() nounwind { store volatile <16 x i8> , <16 x i8>*@v16i8 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void @@ -77,8 +77,8 @@ define void @const_v8i16() nounwind { store volatile <8 x i16> , <8 x i16>*@v8i16 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.h [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <8 x i16> , <8 x i16>*@v8i16 @@ -93,8 +93,8 @@ define void @const_v8i16() nounwind { store volatile <8 x i16> , <8 x i16>*@v8i16 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.h [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void @@ -111,8 +111,8 @@ 
define void @const_v4i32() nounwind { store volatile <4 x i32> , <4 x i32>*@v4i32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <4 x i32> , <4 x i32>*@v4i32 @@ -123,14 +123,14 @@ define void @const_v4i32() nounwind { store volatile <4 x i32> , <4 x i32>*@v4i32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <4 x i32> , <4 x i32>*@v4i32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void @@ -156,15 +156,15 @@ define void @const_v2i64() nounwind { store volatile <2 x i64> , <2 x i64>*@v2i64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; MIPS32: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) ; MIPS64: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x i64> , <2 x i64>*@v2i64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, 
%got_ofst(.L ; MIPS32: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) ; MIPS64: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll index d714b3e..1546878 100644 --- a/test/CodeGen/Mips/msa/basic_operations_float.ll +++ b/test/CodeGen/Mips/msa/basic_operations_float.ll @@ -23,8 +23,8 @@ define void @const_v4f32() nounwind { store volatile <4 x float> , <4 x float>*@v4f32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <4 x float> , <4 x float>*@v4f32 @@ -34,14 +34,14 @@ define void @const_v4f32() nounwind { store volatile <4 x float> , <4 x float>*@v4f32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <4 x float> , <4 x float>*@v4f32 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void @@ -55,38 +55,38 @@ define void @const_v2f64() nounwind { store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: 
ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) store volatile <2 x double> , <2 x double>*@v2f64 ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($ - ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ - ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($ + ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L + ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L ; ALL: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]]) ret void diff --git a/test/CodeGen/Mips/octeon.ll b/test/CodeGen/Mips/octeon.ll index b441274..7e2a810 100644 --- a/test/CodeGen/Mips/octeon.ll 
+++ b/test/CodeGen/Mips/octeon.ll @@ -91,9 +91,9 @@ entry: define i64 @bbit0(i64 %a) nounwind { entry: ; ALL-LABEL: bbit0: -; OCTEON: bbit0 $4, 3, $[[BB0:BB[0-9_]+]] +; OCTEON: bbit0 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]] ; MIPS64: andi $[[T0:[0-9]+]], $4, 8 -; MIPS64: bnez $[[T0]], $[[BB0:BB[0-9_]+]] +; MIPS64: bnez $[[T0]], [[BB0:(\$|\.L)BB[0-9_]+]] %bit = and i64 %a, 8 %res = icmp eq i64 %bit, 0 br i1 %res, label %endif, label %if @@ -107,11 +107,11 @@ endif: define i64 @bbit032(i64 %a) nounwind { entry: ; ALL-LABEL: bbit032: -; OCTEON: bbit032 $4, 3, $[[BB0:BB[0-9_]+]] +; OCTEON: bbit032 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]] ; MIPS64: daddiu $[[T0:[0-9]+]], $zero, 1 ; MIPS64: dsll $[[T1:[0-9]+]], $[[T0]], 35 ; MIPS64: and $[[T2:[0-9]+]], $4, $[[T1]] -; MIPS64: bnez $[[T2]], $[[BB0:BB[0-9_]+]] +; MIPS64: bnez $[[T2]], [[BB0:(\$|\.L)BB[0-9_]+]] %bit = and i64 %a, 34359738368 %res = icmp eq i64 %bit, 0 br i1 %res, label %endif, label %if @@ -125,9 +125,9 @@ endif: define i64 @bbit1(i64 %a) nounwind { entry: ; ALL-LABEL: bbit1: -; OCTEON: bbit1 $4, 3, $[[BB0:BB[0-9_]+]] +; OCTEON: bbit1 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]] ; MIPS64: andi $[[T0:[0-9]+]], $4, 8 -; MIPS64: beqz $[[T0]], $[[BB0:BB[0-9_]+]] +; MIPS64: beqz $[[T0]], [[BB0:(\$|\.L)BB[0-9_]+]] %bit = and i64 %a, 8 %res = icmp ne i64 %bit, 0 br i1 %res, label %endif, label %if @@ -141,11 +141,11 @@ endif: define i64 @bbit132(i64 %a) nounwind { entry: ; ALL-LABEL: bbit132: -; OCTEON: bbit132 $4, 3, $[[BB0:BB[0-9_]+]] +; OCTEON: bbit132 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]] ; MIPS64: daddiu $[[T0:[0-9]+]], $zero, 1 ; MIPS64: dsll $[[T1:[0-9]+]], $[[T0]], 35 ; MIPS64: and $[[T2:[0-9]+]], $4, $[[T1]] -; MIPS64: beqz $[[T2]], $[[BB0:BB[0-9_]+]] +; MIPS64: beqz $[[T2]], [[BB0:(\$|\.L)BB[0-9_]+]] %bit = and i64 %a, 34359738368 %res = icmp ne i64 %bit, 0 br i1 %res, label %endif, label %if diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index c7cf857..f886e1f 100644 --- 
a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -681,10 +681,11 @@ define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind { ; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq - %cvt = fptosi <4 x double> %a0 to <4 x i32> + %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) %res = bitcast <4 x i32> %cvt to <2 x i64> ret <2 x i64> %res } +declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind { ; X32-LABEL: test_mm256_cvttps_epi32: @@ -696,10 +697,11 @@ define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: vcvttps2dq %ymm0, %ymm0 ; X64-NEXT: retq - %cvt = fptosi <8 x float> %a0 to <8 x i32> + %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) %res = bitcast <8 x i32> %cvt to <4 x i64> ret <4 x i64> %res } +declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; X32-LABEL: test_mm256_div_pd: diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index a7b4c6b..0630fd8 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -359,35 +359,12 @@ define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) { declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone -define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) { -; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retl - %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind 
readnone - - -define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-NEXT: retl - %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1] - ret <8 x i32> %res -} -declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone - - define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) { ; add operation forces the execution domain. ; CHECK-LABEL: test_x86_sse2_storeu_dq: ; CHECK: ## BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqu %xmm0, (%eax) ; CHECK-NEXT: retl %a2 = add <16 x i8> %a1, diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 3576329..c5d60da 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -3431,6 +3431,39 @@ define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) { declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone +define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) { +; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256: +; AVX: ## BB#0: +; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcvttpd2dqy %ymm0, %xmm0 +; AVX512VL-NEXT: retl + %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone + + +define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) { +; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256: +; AVX: ## BB#0: +; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcvttps2dq %ymm0, 
%ymm0 +; AVX512VL-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone + + define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) { ; AVX-LABEL: test_x86_avx_dp_ps_256: ; AVX: ## BB#0: @@ -4552,7 +4585,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind { ; AVX-LABEL: movnt_dq: ; AVX: ## BB#0: ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0 +; AVX-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0 ; AVX-NEXT: vmovntdq %ymm0, (%eax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retl @@ -4560,7 +4593,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind { ; AVX512VL-LABEL: movnt_dq: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ; AVX512VL-NEXT: retl %a2 = add <2 x i64> %a1, diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 914f859..d2410e4 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -744,6 +744,36 @@ define <8 x double> @sitofp_8i8_double(<8 x i8> %a) { ret <8 x double> %1 } +define <16 x double> @sitofp_16i1_double(<16 x double> %a) { +; KNL-LABEL: sitofp_16i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm1, %ymm1 +; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_16i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0 +; SKX-NEXT: 
vcmpltpd %zmm0, %zmm2, %k1 +; SKX-NEXT: vpmovm2d %k1, %ymm0 +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 +; SKX-NEXT: vpmovm2d %k0, %ymm1 +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 +; SKX-NEXT: retq + %cmpres = fcmp ogt <16 x double> %a, zeroinitializer + %1 = sitofp <16 x i1> %cmpres to <16 x double> + ret <16 x double> %1 +} + define <8 x double> @sitofp_8i1_double(<8 x double> %a) { ; KNL-LABEL: sitofp_8i1_double: ; KNL: ## BB#0: @@ -767,6 +797,130 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) { ret <8 x double> %1 } +define <8 x float> @sitofp_8i1_float(<8 x float> %a) { +; KNL-LABEL: sitofp_8i1_float: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_8i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %ymm0 +; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <8 x float> %a, zeroinitializer + %1 = sitofp <8 x i1> %cmpres to <8 x float> + ret <8 x float> %1 +} + +define <4 x float> @sitofp_4i1_float(<4 x float> %a) { +; KNL-LABEL: sitofp_4i1_float: +; KNL: ## BB#0: +; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_4i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <4 x float> %a, zeroinitializer + %1 = sitofp <4 x i1> %cmpres to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @sitofp_4i1_double(<4 x double> %a) { +; KNL-LABEL: sitofp_4i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vxorpd %ymm1, 
%ymm1, %ymm1 +; KNL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vpslld $31, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 +; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_4i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vcmpltpd %ymm0, %ymm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <4 x double> %a, zeroinitializer + %1 = sitofp <4 x i1> %cmpres to <4 x double> + ret <4 x double> %1 +} + +define <2 x float> @sitofp_2i1_float(<2 x float> %a) { +; KNL-LABEL: sitofp_2i1_float: +; KNL: ## BB#0: +; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; KNL-NEXT: vpsllq $32, %xmm0, %xmm0 +; KNL-NEXT: vpsrad $31, %xmm0, %xmm1 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: xorl %ecx, %ecx +; KNL-NEXT: testb $1, %al +; KNL-NEXT: movl $-1, %eax +; KNL-NEXT: movl $0, %edx +; KNL-NEXT: cmovnel %eax, %edx +; KNL-NEXT: vcvtsi2ssl %edx, %xmm0, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rdx +; KNL-NEXT: testb $1, %dl +; KNL-NEXT: cmovnel %eax, %ecx +; KNL-NEXT: vcvtsi2ssl %ecx, %xmm0, %xmm0 +; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_2i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <2 x float> %a, zeroinitializer + %1 = sitofp <2 x i1> %cmpres to <2 x float> + ret <2 x float> %1 +} + +define <2 x double> @sitofp_2i1_double(<2 x double> %a) { +; KNL-LABEL: sitofp_2i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; 
KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: sitofp_2i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; SKX-NEXT: vpmovm2q %k0, %xmm0 +; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 +; SKX-NEXT: retq + %cmpres = fcmp ogt <2 x double> %a, zeroinitializer + %1 = sitofp <2 x i1> %cmpres to <2 x double> + ret <2 x double> %1 +} + define <16 x float> @uitofp_16i8(<16 x i8>%a) { ; ALL-LABEL: uitofp_16i8: ; ALL: ## BB#0: @@ -787,3 +941,196 @@ define <16 x float> @uitofp_16i16(<16 x i16>%a) { ret <16 x float>%b } +define <16 x float> @uitofp_16i1_float(<16 x i32> %a) { +; ALL-LABEL: uitofp_16i1_float: +; ALL: ## BB#0: +; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; ALL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x double> @uitofp_16i1_double(<16 x i32> %a) { +; KNL-LABEL: uitofp_16i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; KNL-NEXT: movq {{.*}}(%rip), %rax +; KNL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; KNL-NEXT: kshiftrw $8, %k1, %k1 +; KNL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm1, %ymm1 +; KNL-NEXT: vcvtudq2pd %ymm1, %zmm1 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_16i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; SKX-NEXT: movl {{.*}}(%rip), %eax +; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} +; SKX-NEXT: vcvtudq2pd %ymm1, %zmm1 +; SKX-NEXT: retq + %mask = icmp slt <16 x i32> 
%a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x double> + ret <16 x double> %1 +} + +define <8 x float> @uitofp_8i1_float(<8 x i32> %a) { +; KNL-LABEL: uitofp_8i1_float: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_8i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0 +; SKX-NEXT: retq + %mask = icmp slt <8 x i32> %a, zeroinitializer + %1 = uitofp <8 x i1> %mask to <8 x float> + ret <8 x float> %1 +} + +define <8 x double> @uitofp_8i1_double(<8 x i32> %a) { +; KNL-LABEL: uitofp_8i1_double: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_8i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 +; SKX-NEXT: retq + %mask = icmp slt <8 x i32> %a, zeroinitializer + %1 = uitofp <8 x i1> %mask to <8 x double> + ret <8 x double> %1 +} + +define <4 x float> @uitofp_4i1_float(<4 x i32> %a) { +; KNL-LABEL: uitofp_4i1_float: +; KNL: ## BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpsrld $31, %xmm0, %xmm0 +; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_4i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, 
%xmm1, %xmm1 +; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 +; SKX-NEXT: retq + %mask = icmp slt <4 x i32> %a, zeroinitializer + %1 = uitofp <4 x i1> %mask to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @uitofp_4i1_double(<4 x i32> %a) { +; KNL-LABEL: uitofp_4i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpsrld $31, %xmm0, %xmm0 +; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_4i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0 +; SKX-NEXT: retq + %mask = icmp slt <4 x i32> %a, zeroinitializer + %1 = uitofp <4 x i1> %mask to <4 x double> + ret <4 x double> %1 +} + +define <2 x float> @uitofp_2i1_float(<2 x i32> %a) { +; KNL-LABEL: uitofp_2i1_float: +; KNL: ## BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0 +; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_2i1_float: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 +; SKX-NEXT: retq + %mask = icmp ult <2 x i32> %a, zeroinitializer + %1 
= uitofp <2 x i1> %mask to <2 x float> + ret <2 x float> %1 +} + +define <2 x double> @uitofp_2i1_double(<2 x i32> %a) { +; KNL-LABEL: uitofp_2i1_double: +; KNL: ## BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0 +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm0 +; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; KNL-NEXT: retq +; +; SKX-LABEL: uitofp_2i1_double: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} +; SKX-NEXT: vcvtuqq2pd %xmm0, %xmm0 +; SKX-NEXT: retq + %mask = icmp ult <2 x i32> %a, zeroinitializer + %1 = uitofp <2 x i1> %mask to <2 x double> + ret <2 x double> %1 +} diff --git a/test/CodeGen/X86/pr28504.ll b/test/CodeGen/X86/pr28504.ll new file mode 100644 index 0000000..a617c8a --- /dev/null +++ b/test/CodeGen/X86/pr28504.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +; The test case is rather involved, because we need to get to a state where +; We have a sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) combine, +; BUT this combine is only triggered post-legalization, so the setcc's return +; type is i8. So we can't have the combine opportunity be exposed too early. +; Basically, what we want to see is that the compare result zero-extended, and +; then stored. Only one zext, and no sexts. 
+ +; CHECK-LABEL: main: +; CHECK: movzbl (%rdi), %[[EAX:.*]] +; CHECK-NEXT: xorl %e[[C:.]]x, %e[[C]]x +; CHECK-NEXT: cmpl $1, %[[EAX]] +; CHECK-NEXT: sete %[[C]]l +; CHECK-NEXT: movl %e[[C]]x, (%rsi) +define void @main(i8* %p, i32* %q) { +bb: + %tmp4 = load i8, i8* %p, align 1 + %tmp5 = sext i8 %tmp4 to i32 + %tmp6 = load i8, i8* %p, align 1 + %tmp7 = zext i8 %tmp6 to i32 + %tmp8 = sub nsw i32 %tmp5, %tmp7 + %tmp11 = icmp eq i32 %tmp7, 1 + %tmp12 = zext i1 %tmp11 to i32 + %tmp13 = add nsw i32 %tmp8, %tmp12 + %tmp14 = trunc i32 %tmp13 to i8 + %tmp15 = sext i8 %tmp14 to i16 + %tmp16 = sext i16 %tmp15 to i32 + store i32 %tmp16, i32* %q, align 4 + br i1 %tmp11, label %bb21, label %bb22 + +bb21: ; preds = %bb + unreachable + +bb22: ; preds = %bb + ret void +} diff --git a/test/CodeGen/X86/pr28824.ll b/test/CodeGen/X86/pr28824.ll new file mode 100644 index 0000000..ced1f00 --- /dev/null +++ b/test/CodeGen/X86/pr28824.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s + +@d = global i32 0, align 4 + +; Verify the sar happens before ecx is clobbered with the parameter being +; passed to fn3 +; CHECK-LABEL: fn4 +; CHECK: movb d, %cl +; CHECK: sarl %cl +; CHECK: movl $2, %ecx +define i32 @fn4(i32 %i) #0 { +entry: + %0 = load i32, i32* @d, align 4 + %shr = ashr i32 %i, %0 + tail call fastcc void @fn3(i32 2, i32 5, i32 %shr, i32 %i) + %cmp = icmp slt i32 %shr, 1 + %. = zext i1 %cmp to i32 + ret i32 %. 
+} + +declare void @fn3(i32 %p1, i32 %p2, i32 %p3, i32 %p4) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll index 2102b42..aad00e7 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll @@ -6,13 +6,12 @@ define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind { ; X64-LABEL: test_mm_cvtsi64_ss: ; X64: # BB#0: -; X64-NEXT: cvtsi2ssq %rdi, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: cvtsi2ssq %rdi, %xmm0 ; X64-NEXT: retq - %cvt = sitofp i64 %a1 to float - %res = insertelement <4 x float> %a0, float %cvt, i32 0 + %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ret <4 x float> %res } +declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind { ; X64-LABEL: test_mm_cvtss_si64: @@ -29,7 +28,7 @@ define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: retq - %cvt = extractelement <4 x float> %a0, i32 0 - %res = fptosi float %cvt to i64 + %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ret i64 %res } +declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 090ddfd..4715b7f 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -707,20 +707,17 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind { ; X32-LABEL: test_mm_cvtsi32_ss: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: cvtsi2ssl %eax, %xmm1 -; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: cvtsi2ssl 
{{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtsi32_ss: ; X64: # BB#0: -; X64-NEXT: cvtsi2ssl %edi, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: cvtsi2ssl %edi, %xmm0 ; X64-NEXT: retq - %cvt = sitofp i32 %a1 to float - %res = insertelement <4 x float> %a0, float %cvt, i32 0 + %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1) ret <4 x float> %res } +declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind { ; X32-LABEL: test_mm_cvtss_f32: @@ -762,10 +759,10 @@ define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttss2si %xmm0, %eax ; X64-NEXT: retq - %cvt = extractelement <4 x float> %a0, i32 0 - %res = fptosi float %cvt to i32 + %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ret i32 %res } +declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind { ; X32-LABEL: test_mm_cvttss_si32: @@ -777,8 +774,7 @@ define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttss2si %xmm0, %eax ; X64-NEXT: retq - %cvt = extractelement <4 x float> %a0, i32 0 - %res = fptosi float %cvt to i32 + %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ret i32 %res } diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll index f5ecfa4..6b9dc40 100644 --- a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll +++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll @@ -25,13 +25,12 @@ define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind { define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind { ; X64-LABEL: test_mm_cvtsi64_sd: ; X64: # BB#0: -; X64-NEXT: cvtsi2sdq %rdi, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: cvtsi2sdq %rdi, %xmm0 ; X64-NEXT: retq - %cvt = sitofp i64 %a1 to 
double - %res = insertelement <2 x double> %a0, double %cvt, i32 0 + %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ret <2 x double> %res } +declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind { ; X64-LABEL: test_mm_cvtsi64_si128: @@ -48,10 +47,10 @@ define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttsd2si %xmm0, %rax ; X64-NEXT: retq - %ext = extractelement <2 x double> %a0, i32 0 - %res = fptosi double %ext to i64 + %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ret i64 %res } +declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind { ; X64-LABEL: test_mm_loadu_si64: diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index fa71325..d3ebba9 100644 --- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1208,6 +1208,39 @@ define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind { } declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone +define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) { +; X32-LABEL: test_mm_cvtsd_ss: +; X32: # BB#0: +; X32-NEXT: cvtsd2ss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cvtsd_ss: +; X64: # BB#0: +; X64-NEXT: cvtsd2ss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone + +define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) { +; X32-LABEL: test_mm_cvtsd_ss_load: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movaps (%eax), %xmm1 +; X32-NEXT: cvtsd2ss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: 
test_mm_cvtsd_ss_load: +; X64: # BB#0: +; X64-NEXT: movaps (%rdi), %xmm1 +; X64-NEXT: cvtsd2ss %xmm1, %xmm0 +; X64-NEXT: retq + %a1 = load <2 x double>, <2 x double>* %p1 + %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) + ret <4 x float> %res +} + define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind { ; X32-LABEL: test_mm_cvtsi128_si32: ; X32: # BB#0: @@ -1303,10 +1336,11 @@ define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttps2dq %xmm0, %xmm0 ; X64-NEXT: retq - %res = fptosi <4 x float> %a0 to <4 x i32> + %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) %bc = bitcast <4 x i32> %res to <2 x i64> ret <2 x i64> %bc } +declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind { ; X32-LABEL: test_mm_cvttsd_si32: @@ -1318,10 +1352,10 @@ define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind { ; X64: # BB#0: ; X64-NEXT: cvttsd2si %xmm0, %eax ; X64-NEXT: retq - %ext = extractelement <2 x double> %a0, i32 0 - %res = fptosi double %ext to i32 + %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ret i32 %res } +declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm_div_pd: diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll index ae6626b..27a3fce 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -66,17 +66,6 @@ define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) { declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone -define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) { -; CHECK-LABEL: test_x86_sse2_cvttps2dq: -; CHECK: ## BB#0: -; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: retl - %res = call <4 x i32> 
@llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone - - define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_x86_sse2_storel_dq: ; CHECK: ## BB#0: @@ -94,7 +83,7 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) { ; CHECK-LABEL: test_x86_sse2_storeu_dq: ; CHECK: ## BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: paddb LCPI8_0, %xmm0 +; CHECK-NEXT: paddb LCPI7_0, %xmm0 ; CHECK-NEXT: movdqu %xmm0, (%eax) ; CHECK-NEXT: retl %a2 = add <16 x i8> %a1, diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index 617e30e..3ae3aec 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL @@ -274,6 +274,25 @@ define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) { declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone +define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %p1) { +; SSE-LABEL: test_x86_sse2_cvtsd2ss_load: +; SSE: ## BB#0: +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movaps (%eax), %xmm1 +; SSE-NEXT: cvtsd2ss %xmm1, %xmm0 +; SSE-NEXT: retl +; +; KNL-LABEL: test_x86_sse2_cvtsd2ss_load: +; KNL: ## BB#0: +; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL-NEXT: vcvtsd2ss (%eax), %xmm0, %xmm0 +; KNL-NEXT: retl + %a1 = load <2 x double>, <2 x double>* %p1 + %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1] + ret <4 x float> %res 
+} + + define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) { ; SSE-LABEL: test_x86_sse2_cvtsi2sd: ; SSE: ## BB#0: @@ -306,6 +325,25 @@ define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) { declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone +define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, <4 x float>* %p1) { +; SSE-LABEL: test_x86_sse2_cvtss2sd_load: +; SSE: ## BB#0: +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movaps (%eax), %xmm1 +; SSE-NEXT: cvtss2sd %xmm1, %xmm0 +; SSE-NEXT: retl +; +; KNL-LABEL: test_x86_sse2_cvtss2sd_load: +; KNL: ## BB#0: +; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL-NEXT: vcvtss2sd (%eax), %xmm0, %xmm0 +; KNL-NEXT: retl + %a1 = load <4 x float>, <4 x float>* %p1 + %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} + + define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) { ; SSE-LABEL: test_x86_sse2_cvttpd2dq: ; SSE: ## BB#0: @@ -322,6 +360,22 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) { declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone +define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) { +; SSE-LABEL: test_x86_sse2_cvttps2dq: +; SSE: ## BB#0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: retl +; +; KNL-LABEL: test_x86_sse2_cvttps2dq: +; KNL: ## BB#0: +; KNL-NEXT: vcvttps2dq %xmm0, %xmm0 +; KNL-NEXT: retl + %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone + + define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) { ; SSE-LABEL: test_x86_sse2_cvttsd2si: ; SSE: ## BB#0: diff --git a/test/CodeGen/X86/tail-merge-after-mbp.ll b/test/CodeGen/X86/tail-merge-after-mbp.ll new file mode 100644 index 0000000..dc5f3a1 --- /dev/null +++ 
b/test/CodeGen/X86/tail-merge-after-mbp.ll @@ -0,0 +1,94 @@ +; RUN: llc -mtriple=x86_64-linux -o - %s | FileCheck %s + +%0 = type { %1, %3* } +%1 = type { %2* } +%2 = type { %2*, i8* } +%3 = type { i32, i32 (i32, i32)* } + + +declare i32 @Up(...) +declare i32 @f(i32, i32) + +; check loop block_14 is not merged with block_21 +; check loop block_11 is not merged with block_18, block_25 +define i32 @foo(%0* nocapture readonly, i32, i1 %c, i8* %p1, %2** %p2) { +; CHECK-LABEL: foo: +; CHECK: # %block_11 +; CHECK-NEXT: movq (%r14), %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: je +; CHECK-NEXT:# %block_14 +; CHECK-NEXT: cmpq $0, 8(%rax) +; CHECK-NEXT: jne +; CHECK-NEXT:# %block_18 +; CHECK-NEXT: movq (%r14), %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: je +; CHECK-NEXT:# %block_21 +; CHECK-NEXT:# =>This Inner Loop Header +; CHECK-NEXT: cmpq $0, 8(%rax) +; CHECK-NEXT: jne +; CHECK-NEXT:# %block_25 +; CHECK-NEXT:# in Loop +; CHECK-NEXT: movq (%r14), %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jne + br i1 %c, label %block_34, label %block_3 + +block_3: ; preds = %2 + br i1 %c, label %block_7, label %block_4 + +block_4: ; preds = %block_3 + %a5 = tail call i32 @f(i32 undef, i32 undef) + %a6 = icmp eq i32 %a5, 0 + br i1 %a6, label %block_7, label %block_34 + +block_7: ; preds = %block_4, %block_3 + %a8 = icmp eq %2* null, null + br i1 %a8, label %block_34, label %block_9 + +block_9: ; preds = %block_7 + %a10 = icmp eq i8* %p1, null + br i1 %a10, label %block_11, label %block_32 + +block_11: ; preds = %block_9 + %a12 = load %2*, %2** %p2, align 8 + %a13 = icmp eq %2* %a12, null + br i1 %a13, label %block_34, label %block_14 + +block_14: ; preds = %block_11 + %a15 = getelementptr inbounds %2, %2* %a12, i64 0, i32 1 + %a16 = load i8*, i8** %a15, align 8 + %a17 = icmp eq i8* %a16, null + br i1 %a17, label %block_18, label %block_32 + +block_18: ; preds = %block_14 + %a19 = load %2*, %2** %p2, align 8 + %a20 = icmp eq %2* %a19, null + br i1 %a20, label 
%block_34, label %block_21 + +block_21: ; preds = %block_18 + %a22 = getelementptr inbounds %2, %2* %a19, i64 0, i32 1 + %a23 = load i8*, i8** %a22, align 8 + %a24 = icmp eq i8* %a23, null + br i1 %a24, label %block_25, label %block_32 + +block_25: ; preds = %block_28, %block_21 + %a26 = load %2*, %2** %p2, align 8 + %a27 = icmp eq %2* %a26, null + br i1 %a27, label %block_34, label %block_28 + +block_28: ; preds = %block_25 + %a29 = getelementptr inbounds %2, %2* %a26, i64 0, i32 1 + %a30 = load i8*, i8** %a29, align 8 + %a31 = icmp eq i8* %a30, null + br i1 %a31, label %block_25, label %block_32 + +block_32: ; preds = %block_28, %block_21, %block_14, %block_9 + %a33 = tail call i32 (...) @Up() + br label %block_34 + +block_34: ; preds = %block_32, %block_25, %block_18, %block_11, %block_7, %block_4, %2 + %a35 = phi i32 [ 0, %2 ], [ %a5, %block_4 ], [ 0, %block_7 ], [ 0, %block_11 ], [ 0, %block_32 ], [ 0, %block_18 ], [ 0, %block_25 ] + ret i32 %a35 +} diff --git a/test/CodeGen/X86/twoaddr-lea.ll b/test/CodeGen/X86/twoaddr-lea.ll index 5779cf3..2944b17 100644 --- a/test/CodeGen/X86/twoaddr-lea.ll +++ b/test/CodeGen/X86/twoaddr-lea.ll @@ -44,3 +44,60 @@ entry: %0 = shl i64 %x, 1 ret i64 %0 } + +@global = external global i32, align 4 +@global2 = external global i64, align 8 + +; Test that liveness is properly updated and we do not encounter the +; assert/crash from http://llvm.org/PR28301 +; CHECK-LABEL: ham +define void @ham() { +bb: + br label %bb1 + +bb1: + %tmp = phi i64 [ %tmp40, %bb9 ], [ 0, %bb ] + %tmp2 = phi i32 [ %tmp39, %bb9 ], [ 0, %bb ] + %tmp3 = icmp sgt i32 undef, 10 + br i1 %tmp3, label %bb2, label %bb3 + +bb2: + %tmp6 = load i32, i32* @global, align 4 + %tmp8 = add nsw i32 %tmp6, %tmp2 + %tmp9 = sext i32 %tmp8 to i64 + br label %bb6 + +bb3: +; CHECK: subl %e[[REG0:[a-z0-9]+]], +; CHECK: leaq 4({{%[a-z0-9]+}}), %r[[REG0]] + %tmp14 = phi i64 [ %tmp15, %bb5 ], [ 0, %bb1 ] + %tmp15 = add nuw i64 %tmp14, 4 + %tmp16 = trunc i64 %tmp14 to i32 + %tmp17 = 
sub i32 %tmp2, %tmp16 + br label %bb4 + +bb4: + %tmp20 = phi i64 [ %tmp14, %bb3 ], [ %tmp34, %bb5 ] + %tmp28 = icmp eq i32 %tmp17, 0 + br i1 %tmp28, label %bb5, label %bb8 + +bb5: + %tmp34 = add nuw nsw i64 %tmp20, 1 + %tmp35 = icmp slt i64 %tmp34, %tmp15 + br i1 %tmp35, label %bb4, label %bb3 + +bb6: + store volatile i64 %tmp, i64* @global2, align 8 + store volatile i64 %tmp9, i64* @global2, align 8 + store volatile i32 %tmp6, i32* @global, align 4 + %tmp45 = icmp slt i32 undef, undef + br i1 %tmp45, label %bb6, label %bb9 + +bb8: + unreachable + +bb9: + %tmp39 = add nuw nsw i32 %tmp2, 4 + %tmp40 = add nuw i64 %tmp, 4 + br label %bb1 +} diff --git a/test/DebugInfo/COFF/inlining-same-name.ll b/test/DebugInfo/COFF/inlining-same-name.ll index 44b8791..fda5a6d 100644 --- a/test/DebugInfo/COFF/inlining-same-name.ll +++ b/test/DebugInfo/COFF/inlining-same-name.ll @@ -33,7 +33,7 @@ target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc" -define void @main(i32* %i.i) { +define void @main(i32* %i.i) !dbg !16 { store volatile i32 3, i32* %i.i, !dbg !6 store volatile i32 3, i32* %i.i, !dbg !19 ret void diff --git a/test/DebugInfo/COFF/pr28747.ll b/test/DebugInfo/COFF/pr28747.ll new file mode 100644 index 0000000..d19a2fa --- /dev/null +++ b/test/DebugInfo/COFF/pr28747.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK: .section .debug$S,"dr"{{$}} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .cv_filechecksums +; CHECK-NEXT: .cv_stringtable + +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +define void @baz() { +entry: + %x.i.i = alloca i32, align 4 + call void @llvm.dbg.declare(metadata i32* %x.i.i, metadata !6, metadata !12), !dbg !13 + store i32 5, i32* %x.i.i, align 4, !dbg !13 + ret void +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #0 + +attributes #0 = { nounwind 
readnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 276756) (llvm/trunk 276952)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "-", directory: "/") +!2 = !{} +!3 = !{i32 2, !"CodeView", i32 1} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 4.0.0 (trunk 276756) (llvm/trunk 276952)"} +!6 = !DILocalVariable(name: "x", scope: !7, file: !8, line: 1, type: !11) +!7 = distinct !DISubprogram(name: "foo", scope: !8, file: !8, line: 1, type: !9, isLocal: true, isDefinition: true, scopeLine: 1, isOptimized: false, unit: !0, variables: !2) +!8 = !DIFile(filename: "", directory: "/") +!9 = !DISubroutineType(types: !10) +!10 = !{null} +!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!12 = !DIExpression() +!13 = !DILocation(line: 1, column: 56, scope: !7, inlinedAt: !14) +!14 = distinct !DILocation(line: 2, column: 52, scope: !15) +!15 = distinct !DISubprogram(name: "bar", scope: !8, file: !8, line: 2, type: !9, isLocal: true, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, variables: !2) diff --git a/test/Instrumentation/ThreadSanitizer/do-not-instrument-memory-access.ll b/test/Instrumentation/ThreadSanitizer/do-not-instrument-memory-access.ll index db12ec7..494df83 100644 --- a/test/Instrumentation/ThreadSanitizer/do-not-instrument-memory-access.ll +++ b/test/Instrumentation/ThreadSanitizer/do-not-instrument-memory-access.ll @@ -13,6 +13,8 @@ target triple = "x86_64-apple-macosx10.9" @__llvm_gcov_ctr = internal global [1 x i64] zeroinitializer @__llvm_gcov_ctr.1 = internal global [1 x i64] zeroinitializer +@__llvm_gcov_global_state_pred = internal global i32 0 +@__llvm_gcda_foo = internal global i32 0 define i32 @test_gep() sanitize_thread { entry: @@ -42,5 +44,16 @@ entry: ret i32 undef } +define void @test_load() 
sanitize_thread { +entry: + %0 = load i32, i32* @__llvm_gcov_global_state_pred + store i32 1, i32* @__llvm_gcov_global_state_pred + + %1 = load i32, i32* @__llvm_gcda_foo + store i32 1, i32* @__llvm_gcda_foo + + ret void +} + ; CHECK-NOT: {{call void @__tsan_write}} ; CHECK: __tsan_init diff --git a/test/Linker/Inputs/metadata-with-global-value-operand.ll b/test/Linker/Inputs/metadata-with-global-value-operand.ll new file mode 100644 index 0000000..21d3e27 --- /dev/null +++ b/test/Linker/Inputs/metadata-with-global-value-operand.ll @@ -0,0 +1,3 @@ +!named.null = !{!0} + +!0 = !{null} diff --git a/test/Linker/metadata-with-global-value-operand.ll b/test/Linker/metadata-with-global-value-operand.ll new file mode 100644 index 0000000..fb4c01a --- /dev/null +++ b/test/Linker/metadata-with-global-value-operand.ll @@ -0,0 +1,14 @@ +; RUN: llvm-link -S -o - %s %S/Inputs/metadata-with-global-value-operand.ll | FileCheck %s +; This test confirms that the !{null} from the second module doesn't get mapped +; onto the abandoned !{i1* @var} node from this module. 
+ +; CHECK: @var = global +@var = global i1 false + +; CHECK: !named.vars = !{!0} +; CHECK: !named.null = !{!1} +!named.vars = !{!0} + +; CHECK: !0 = !{i1* @var} +; CHECK: !1 = !{null} +!0 = !{i1* @var} diff --git a/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt b/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt index 832aa3f..6ff5f54 100644 --- a/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt +++ b/test/MC/Disassembler/ARM/unpredictable-SSAT-arm.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 2>&1 | FileCheck %s +# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 2>&1 | FileCheck %s # Opcode=322 Name=SSAT Format=ARM_FORMAT_SATFRM(13) # 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 diff --git a/test/MC/Mips/cpsetup.s b/test/MC/Mips/cpsetup.s index c5d0f9b..149419f 100644 --- a/test/MC/Mips/cpsetup.s +++ b/test/MC/Mips/cpsetup.s @@ -1,22 +1,22 @@ -# RUN: llvm-mc -triple mips64-unknown-linux -target-abi o32 -filetype=obj -o - %s | \ +# RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 -filetype=obj -o - %s | \ # RUN: llvm-objdump -d -r - | FileCheck -check-prefixes=ALL,O32 %s -# RUN: llvm-mc -triple mips64-unknown-unknown -target-abi o32 %s | \ -# RUN: FileCheck -check-prefixes=ALL,ASM %s +# RUN: llvm-mc -triple mips-unknown-unknown -target-abi o32 %s | \ +# RUN: FileCheck -check-prefixes=ALL,ASM,ASM-O32 %s # RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 -filetype=obj -o - %s | \ # RUN: llvm-objdump -d -r - | \ # RUN: FileCheck -check-prefixes=ALL,NXX,N32 %s # RUN: llvm-mc -triple mips64-unknown-unknown -target-abi n32 %s | \ -# RUN: FileCheck -check-prefixes=ALL,ASM %s +# RUN: FileCheck -check-prefixes=ALL,ASM,ASM-N32 %s # RUN: llvm-mc -triple mips64-unknown-linux %s -filetype=obj -o - | \ # RUN: llvm-objdump -d -r - | \ # RUN: FileCheck -check-prefixes=ALL,NXX,N64 %s # RUN: llvm-mc -triple mips64-unknown-unknown %s | \ -# RUN: FileCheck 
-check-prefixes=ALL,ASM %s +# RUN: FileCheck -check-prefixes=ALL,ASM,ASM-N64 %s .text .option pic2 @@ -105,8 +105,10 @@ t3: # NXX-NEXT: nop # NXX-NEXT: sub $3, $3, $2 -# ASM: $tmp0: -# ASM-NEXT: .cpsetup $25, $2, $tmp0 +# ASM-O32: [[LABEL:\$tmp0]]: +# ASM-N32: [[LABEL:\.Ltmp0]]: +# ASM-N64: [[LABEL:\.Ltmp0]]: +# ASM-NEXT: .cpsetup $25, $2, [[LABEL]] # Ensure we have at least one instruction between labels so that the labels # we're matching aren't removed. diff --git a/test/MC/Mips/expansion-jal-sym-pic.s b/test/MC/Mips/expansion-jal-sym-pic.s index f2ceca0..1cc4751 100644 --- a/test/MC/Mips/expansion-jal-sym-pic.s +++ b/test/MC/Mips/expansion-jal-sym-pic.s @@ -10,7 +10,7 @@ # RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=micromips -show-encoding |\ # RUN: FileCheck %s -check-prefixes=ALL,MICROMIPS,O32-MICROMIPS -# RUN: llvm-mc %s -arch=mips -mcpu=mips64 -target-abi n32 -mattr=micromips -show-encoding |\ +# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 -mattr=micromips -show-encoding |\ # RUN: FileCheck %s -check-prefixes=ALL,MICROMIPS,N32-MICROMIPS # RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 -mattr=micromips -show-encoding |\ @@ -164,19 +164,19 @@ local_label: # N32: lw $25, %got_disp($tmp0)($gp) # encoding: [0x8f,0x99,A,A] # N32: # fixup A - offset: 0, value: %got_disp($tmp0), kind: fixup_Mips_GOT_DISP -# N64: ld $25, %got_disp($tmp0)($gp) # encoding: [0xdf,0x99,A,A] -# N64: # fixup A - offset: 0, value: %got_disp($tmp0), kind: fixup_Mips_GOT_DISP +# N64: ld $25, %got_disp(.Ltmp0)($gp) # encoding: [0xdf,0x99,A,A] +# N64: # fixup A - offset: 0, value: %got_disp(.Ltmp0), kind: fixup_Mips_GOT_DISP # O32-MICROMIPS: lw $25, %got($tmp0)($gp) # encoding: [0xff,0x3c,A,A] # O32-MICROMIPS: # fixup A - offset: 0, value: %got($tmp0), kind: fixup_MICROMIPS_GOT16 # O32-MICROMIPS: addiu $25, $25, %lo($tmp0) # encoding: [0x33,0x39,A,A] # O32-MICROMIPS: # fixup A - offset: 0, value: %lo($tmp0), kind: fixup_MICROMIPS_LO16 -# N32-MICROMIPS: lw $25, 
%got_disp($tmp0)($gp) # encoding: [0xff,0x3c,A,A] -# N32-MICROMIPS: # fixup A - offset: 0, value: %got_disp($tmp0), kind: fixup_MICROMIPS_GOT_DISP +# N32-MICROMIPS: lw $25, %got_disp(.Ltmp0)($gp) # encoding: [0xff,0x3c,A,A] +# N32-MICROMIPS: # fixup A - offset: 0, value: %got_disp(.Ltmp0), kind: fixup_MICROMIPS_GOT_DISP -# N64-MICROMIPS: ld $25, %got_disp($tmp0)($gp) # encoding: [0xdf,0x99,A,A] -# N64-MICROMIPS: # fixup A - offset: 0, value: %got_disp($tmp0), kind: fixup_MICROMIPS_GOT_DISP +# N64-MICROMIPS: ld $25, %got_disp(.Ltmp0)($gp) # encoding: [0xdf,0x99,A,A] +# N64-MICROMIPS: # fixup A - offset: 0, value: %got_disp(.Ltmp0), kind: fixup_MICROMIPS_GOT_DISP # NORMAL: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MICROMIPS: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] diff --git a/test/MC/Mips/macro-la.s b/test/MC/Mips/macro-la.s index cca4805..c419d64 100644 --- a/test/MC/Mips/macro-la.s +++ b/test/MC/Mips/macro-la.s @@ -1,11 +1,11 @@ # RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r2 | \ -# RUN: FileCheck %s +# RUN: FileCheck %s --check-prefixes=CHECK,O32 # RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 | \ -# RUN: FileCheck %s +# RUN: FileCheck %s --check-prefixes=CHECK,O32 # RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r2 -target-abi=n32 | \ -# RUN: FileCheck %s +# RUN: FileCheck %s --check-prefixes=CHECK,N32 # RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r6 -target-abi=n32 | \ -# RUN: FileCheck %s +# RUN: FileCheck %s --check-prefixes=CHECK,N32 # N64 should be acceptable too but we cannot convert la to dla yet. 
@@ -272,8 +272,12 @@ la $6, symbol+8($6) # CHECK: lui $1, %hi(symbol+8) # encoding: [0x3c,0x0 # CHECK: addiu $1, $1, %lo(symbol+8) # encoding: [0x24,0x21,A,A] # CHECK: # fixup A - offset: 0, value: %lo(symbol+8), kind: fixup_Mips_LO16 # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21] -la $5, 1f # CHECK: lui $5, %hi($tmp0) # encoding: [0x3c,0x05,A,A] - # CHECK: # fixup A - offset: 0, value: %hi($tmp0), kind: fixup_Mips_HI16 - # CHECK: addiu $5, $5, %lo($tmp0) # encoding: [0x24,0xa5,A,A] - # CHECK: # fixup A - offset: 0, value: %lo($tmp0), kind: fixup_Mips_LO16 +la $5, 1f # O32: lui $5, %hi($tmp0) # encoding: [0x3c,0x05,A,A] + # O32: # fixup A - offset: 0, value: %hi($tmp0), kind: fixup_Mips_HI16 + # O32: addiu $5, $5, %lo($tmp0) # encoding: [0x24,0xa5,A,A] + # O32: # fixup A - offset: 0, value: %lo($tmp0), kind: fixup_Mips_LO16 + # N32: lui $5, %hi(.Ltmp0) # encoding: [0x3c,0x05,A,A] + # N32: # fixup A - offset: 0, value: %hi(.Ltmp0), kind: fixup_Mips_HI16 + # N32: addiu $5, $5, %lo(.Ltmp0) # encoding: [0x24,0xa5,A,A] + # N32: # fixup A - offset: 0, value: %lo(.Ltmp0), kind: fixup_Mips_LO16 1: diff --git a/test/MC/Mips/mips3/valid.s b/test/MC/Mips/mips3/valid.s index bcc96b5..d9f7729 100644 --- a/test/MC/Mips/mips3/valid.s +++ b/test/MC/Mips/mips3/valid.s @@ -112,8 +112,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips4/valid.s b/test/MC/Mips/mips4/valid.s index 0a4eb4d..500560e 100644 --- a/test/MC/Mips/mips4/valid.s +++ b/test/MC/Mips/mips4/valid.s @@ -116,8 +116,8 @@ a: floor.l.s $f12,$f5 floor.w.d 
$f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips5/valid.s b/test/MC/Mips/mips5/valid.s index 270ff16..c60a918 100644 --- a/test/MC/Mips/mips5/valid.s +++ b/test/MC/Mips/mips5/valid.s @@ -116,8 +116,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips64/valid.s b/test/MC/Mips/mips64/valid.s index 0ba831b..b8c8a10 100644 --- a/test/MC/Mips/mips64/valid.s +++ b/test/MC/Mips/mips64/valid.s @@ -123,8 +123,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips64r2/valid.s b/test/MC/Mips/mips64r2/valid.s index 5ae3adc..7dd7289 100644 --- a/test/MC/Mips/mips64r2/valid.s +++ b/test/MC/Mips/mips64r2/valid.s @@ -136,8 +136,8 @@ a: floor.l.s $f12,$f5 
floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips64r3/valid.s b/test/MC/Mips/mips64r3/valid.s index ab385da..83681f6 100644 --- a/test/MC/Mips/mips64r3/valid.s +++ b/test/MC/Mips/mips64r3/valid.s @@ -136,8 +136,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/mips64r5/valid.s b/test/MC/Mips/mips64r5/valid.s index 39782f3..e63ed1d 100644 --- a/test/MC/Mips/mips64r5/valid.s +++ b/test/MC/Mips/mips64r5/valid.s @@ -137,8 +137,8 @@ a: floor.l.s $f12,$f5 floor.w.d $f14,$f11 floor.w.s $f8,$f9 - j 1f # CHECK: j $tmp0 # encoding: [0b000010AA,A,A,A] - # CHECK: # fixup A - offset: 0, value: ($tmp0), kind: fixup_Mips_26 + j 1f # CHECK: j .Ltmp0 # encoding: [0b000010AA,A,A,A] + # CHECK: # fixup A - offset: 0, value: .Ltmp0, kind: fixup_Mips_26 j a # CHECK: j a # encoding: [0b000010AA,A,A,A] # CHECK: # fixup A - offset: 0, value: a, kind: fixup_Mips_26 j 1328 # CHECK: j 1328 # encoding: [0x08,0x00,0x01,0x4c] diff --git a/test/MC/Mips/relocation.s b/test/MC/Mips/relocation.s index abbbc6d..42a015d 100644 --- a/test/MC/Mips/relocation.s +++ b/test/MC/Mips/relocation.s @@ -116,7 +116,7 @@ baz: .long foo 
// RELOC: R_MIPS_32 foo // ?????: R_MIPS_SHIFT5 foo // ?????: R_MIPS_SHIFT6 foo -// DATA-NEXT: 0060: 24620000 24620000 24620000 24620000 +// DATA-NEXT: 0060: 24620000 24620000 24620004 24620000 addiu $2, $3, %got_disp(foo) // RELOC: R_MIPS_GOT_DISP foo // ENCBE: addiu $2, $3, %got_disp(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %got_disp(foo) # encoding: [A,A,0x62,0x24] @@ -127,17 +127,27 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: addiu $2, $3, %got_page(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %got_page(foo), kind: fixup_Mips_GOT_PAGE + addiu $2, $3, %got_page(bar) // RELOC: R_MIPS_GOT_PAGE .data + // ENCBE: addiu $2, $3, %got_page(bar) # encoding: [0x24,0x62,A,A] + // ENCLE: addiu $2, $3, %got_page(bar) # encoding: [A,A,0x62,0x24] + // FIXUP: # fixup A - offset: 0, value: %got_page(bar), kind: fixup_Mips_GOT_PAGE + addiu $2, $3, %got_ofst(foo) // RELOC: R_MIPS_GOT_OFST foo // ENCBE: addiu $2, $3, %got_ofst(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %got_ofst(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %got_ofst(foo), kind: fixup_Mips_GOT_OFST +// DATA-NEXT: 0070: 24620004 24620000 24620000 64620000 + addiu $2, $3, %got_ofst(bar) // RELOC: R_MIPS_GOT_OFST .data + // ENCBE: addiu $2, $3, %got_ofst(bar) # encoding: [0x24,0x62,A,A] + // ENCLE: addiu $2, $3, %got_ofst(bar) # encoding: [A,A,0x62,0x24] + // FIXUP: # fixup A - offset: 0, value: %got_ofst(bar), kind: fixup_Mips_GOT_OFST + addiu $2, $3, %got_hi(foo) // RELOC: R_MIPS_GOT_HI16 foo // ENCBE: addiu $2, $3, %got_hi(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %got_hi(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %got_hi(foo), kind: fixup_Mips_GOT_HI16 -// DATA-NEXT: 0070: 24620000 64620000 64620000 24620000 addiu $2, $3, %got_lo(foo) // RELOC: R_MIPS_GOT_LO16 foo // ENCBE: addiu $2, $3, %got_lo(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %got_lo(foo) # encoding: 
[A,A,0x62,0x24] @@ -154,6 +164,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: daddiu $2, $3, %higher(foo) # encoding: [A,A,0x62,0x64] // FIXUP: # fixup A - offset: 0, value: %higher(foo), kind: fixup_Mips_HIGHER +// DATA-NEXT: 0080: 64620000 24620000 24620000 24620000 daddiu $2, $3, %highest(foo) // RELOC: R_MIPS_HIGHEST foo // ENCBE: daddiu $2, $3, %highest(foo) # encoding: [0x64,0x62,A,A] // ENCLE: daddiu $2, $3, %highest(foo) # encoding: [A,A,0x62,0x64] @@ -165,7 +176,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: addiu $2, $3, %call_hi(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %call_hi(foo), kind: fixup_Mips_CALL_HI16 -// DATA-NEXT: 0080: 24620000 24620000 24620000 24620000 +// DATA-NEXT: 0090: 24620000 24620000 24620000 24620000 addiu $2, $3, %call_lo(foo) // RELOC: R_MIPS_CALL_LO16 foo // ENCBE: addiu $2, $3, %call_lo(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %call_lo(foo) # encoding: [A,A,0x62,0x24] @@ -321,7 +332,22 @@ foo_mm: // ENCLE: addiu $2, $2, %lo(long_mm) # encoding: [0x42'A',0x30'A',0x00,0x00] // FIXUP: # fixup A - offset: 0, value: %lo(long_mm), kind: fixup_MICROMIPS_LO16 -// DATA-NEXT: 0020: 30430000 30420000 30430000 30420004 +// DATA-NEXT: 0020: 30430004 00000000 30430004 00000000 + addiu $2, $3, %got_page(bar) // RELOC: R_MICROMIPS_GOT_PAGE .data + // ENCBE: addiu $2, $3, %got_page(bar) # encoding: [0x30,0x43,A,A] + // The placement of the 'A' annotations is incorrect. They use 32-bit little endian instead of 2x 16-bit little endian. + // ENCLE: addiu $2, $3, %got_page(bar) # encoding: [0x43'A',0x30'A',0x00,0x00] + // FIXUP: # fixup A - offset: 0, value: %got_page(bar), kind: fixup_MICROMIPS_GOT_PAGE + nop + + addiu $2, $3, %got_ofst(bar) // RELOC: R_MICROMIPS_GOT_OFST .data + // ENCBE: addiu $2, $3, %got_ofst(bar) # encoding: [0x30,0x43,A,A] + // The placement of the 'A' annotations is incorrect. They use 32-bit little endian instead of 2x 16-bit little endian. 
+ // ENCLE: addiu $2, $3, %got_ofst(bar) # encoding: [0x43'A',0x30'A',0x00,0x00] + // FIXUP: # fixup A - offset: 0, value: %got_ofst(bar), kind: fixup_MICROMIPS_GOT_OFST + nop + +// DATA-NEXT: 0030: 30430000 30420000 30430000 30420004 addiu $2, $3, %hi(foo_mm) // RELOC: R_MICROMIPS_HI16 foo_mm // ENCBE: addiu $2, $3, %hi(foo_mm) # encoding: [0x30,0x43,A,A] // ENCLE: addiu $2, $3, %hi(foo_mm) # encoding: [0x43'A',0x30'A',0x00,0x00] @@ -342,5 +368,5 @@ foo_mm: // ENCLE: addiu $2, $2, %lo(bar) # encoding: [0x42'A',0x30'A',0x00,0x00] // FIXUP: # fixup A - offset: 0, value: %lo(bar), kind: fixup_MICROMIPS_LO16 - .space 65536, 0 + .space 65520, 0 long_mm: diff --git a/test/Transforms/ConstProp/calls.ll b/test/Transforms/ConstProp/calls.ll index a445ac8..d9a884a 100644 --- a/test/Transforms/ConstProp/calls.ll +++ b/test/Transforms/ConstProp/calls.ll @@ -1,47 +1,47 @@ ; RUN: opt < %s -constprop -S | FileCheck %s ; RUN: opt < %s -constprop -disable-simplify-libcalls -S | FileCheck %s --check-prefix=FNOBUILTIN -declare double @acos(double) -declare double @asin(double) -declare double @atan(double) -declare double @atan2(double, double) -declare double @ceil(double) -declare double @cos(double) -declare double @cosh(double) -declare double @exp(double) -declare double @exp2(double) -declare double @fabs(double) -declare double @floor(double) -declare double @fmod(double, double) -declare double @log(double) -declare double @log10(double) -declare double @pow(double, double) -declare double @sin(double) -declare double @sinh(double) -declare double @sqrt(double) -declare double @tan(double) -declare double @tanh(double) +declare double @acos(double) readnone nounwind +declare double @asin(double) readnone nounwind +declare double @atan(double) readnone nounwind +declare double @atan2(double, double) readnone nounwind +declare double @ceil(double) readnone nounwind +declare double @cos(double) readnone nounwind +declare double @cosh(double) readnone nounwind +declare double 
@exp(double) readnone nounwind +declare double @exp2(double) readnone nounwind +declare double @fabs(double) readnone nounwind +declare double @floor(double) readnone nounwind +declare double @fmod(double, double) readnone nounwind +declare double @log(double) readnone nounwind +declare double @log10(double) readnone nounwind +declare double @pow(double, double) readnone nounwind +declare double @sin(double) readnone nounwind +declare double @sinh(double) readnone nounwind +declare double @sqrt(double) readnone nounwind +declare double @tan(double) readnone nounwind +declare double @tanh(double) readnone nounwind -declare float @acosf(float) -declare float @asinf(float) -declare float @atanf(float) -declare float @atan2f(float, float) -declare float @ceilf(float) -declare float @cosf(float) -declare float @coshf(float) -declare float @expf(float) -declare float @exp2f(float) -declare float @fabsf(float) -declare float @floorf(float) -declare float @fmodf(float, float) -declare float @logf(float) -declare float @log10f(float) -declare float @powf(float, float) -declare float @sinf(float) -declare float @sinhf(float) -declare float @sqrtf(float) -declare float @tanf(float) -declare float @tanhf(float) +declare float @acosf(float) readnone nounwind +declare float @asinf(float) readnone nounwind +declare float @atanf(float) readnone nounwind +declare float @atan2f(float, float) readnone nounwind +declare float @ceilf(float) readnone nounwind +declare float @cosf(float) readnone nounwind +declare float @coshf(float) readnone nounwind +declare float @expf(float) readnone nounwind +declare float @exp2f(float) readnone nounwind +declare float @fabsf(float) readnone nounwind +declare float @floorf(float) readnone nounwind +declare float @fmodf(float, float) readnone nounwind +declare float @logf(float) readnone nounwind +declare float @log10f(float) readnone nounwind +declare float @powf(float, float) readnone nounwind +declare float @sinf(float) readnone nounwind +declare 
float @sinhf(float) readnone nounwind +declare float @sqrtf(float) readnone nounwind +declare float @tanf(float) readnone nounwind +declare float @tanhf(float) readnone nounwind define double @T() { ; CHECK-LABEL: @T( @@ -193,11 +193,13 @@ entry: ret i1 %b } -; TODO: Inexact values should not fold as they are dependent on rounding mode +; Inexact values should not fold as they are dependent on rounding mode define i1 @test_sse_cvts_inexact() nounwind readnone { ; CHECK-LABEL: @test_sse_cvts_inexact( -; CHECK-NOT: call -; CHECK: ret i1 true +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call entry: %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> ) nounwind %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> ) nounwind diff --git a/test/Transforms/EarlyCSE/basic.ll b/test/Transforms/EarlyCSE/basic.ll index fa1a705..3c427d8 100644 --- a/test/Transforms/EarlyCSE/basic.ll +++ b/test/Transforms/EarlyCSE/basic.ll @@ -276,3 +276,17 @@ define void @dse_neg2(i32 *%P) { ret void } +@c = external global i32, align 4 +declare i32 @reads_c(i32 returned) +define void @pr28763() { +entry: +; CHECK-LABEL: @pr28763( +; CHECK: store i32 0, i32* @c, align 4 +; CHECK: call i32 @reads_c(i32 0) +; CHECK: store i32 2, i32* @c, align 4 + %load = load i32, i32* @c, align 4 + store i32 0, i32* @c, align 4 + %call = call i32 @reads_c(i32 0) + store i32 2, i32* @c, align 4 + ret void +} diff --git a/test/Transforms/GlobalOpt/metadata.ll b/test/Transforms/GlobalOpt/metadata.ll index 152d58e..b766349 100644 --- a/test/Transforms/GlobalOpt/metadata.ll +++ b/test/Transforms/GlobalOpt/metadata.ll @@ -28,5 +28,5 @@ declare void @llvm.foo(metadata, metadata) nounwind readnone ; CHECK: !named = !{![[NULL:[0-9]+]]} !0 = !{i8*** @G} -; CHECK-DAG: ![[NULL]] = !{null} +; CHECK-DAG: ![[NULL]] = distinct !{null} ; CHECK-DAG: ![[EMPTY]] = !{} diff --git a/test/Transforms/IndVarSimplify/pr28935.ll b/test/Transforms/IndVarSimplify/pr28935.ll new file mode 100644 index 0000000..0cfd1d3 --- 
/dev/null +++ b/test/Transforms/IndVarSimplify/pr28935.ll @@ -0,0 +1,20 @@ +; RUN: opt -S -indvars < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare i16 @fn1(i16 returned, i64) + +define void @fn2() { +; CHECK-LABEL: @fn2( +entry: + br label %for.cond + +for.cond: + %f.0 = phi i64 [ undef, %entry ], [ %inc, %for.cond ] + %conv = trunc i64 %f.0 to i16 + %call = tail call i16 @fn1(i16 %conv, i64 %f.0) + %conv2 = zext i16 %call to i32 + %inc = add nsw i64 %f.0, 1 + br label %for.cond +} diff --git a/test/Transforms/Inline/inalloca-not-static.ll b/test/Transforms/Inline/inalloca-not-static.ll new file mode 100644 index 0000000..e70e30d --- /dev/null +++ b/test/Transforms/Inline/inalloca-not-static.ll @@ -0,0 +1,63 @@ +; RUN: opt -always-inline -S < %s | FileCheck %s + +; We used to misclassify inalloca as a static alloca in the inliner. This only +; arose with for alwaysinline functions, because the normal inliner refuses to +; inline such things. 
+ +; Generated using this C++ source: +; struct Foo { +; Foo(); +; Foo(const Foo &o); +; ~Foo(); +; int a; +; }; +; __forceinline void h(Foo o) {} +; __forceinline void g() { h(Foo()); } +; void f() { g(); } + +; ModuleID = 't.cpp' +source_filename = "t.cpp" +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i386-pc-windows-msvc19.0.24210" + +%struct.Foo = type { i32 } + +declare i8* @llvm.stacksave() +declare void @llvm.stackrestore(i8*) + +declare x86_thiscallcc %struct.Foo* @"\01??0Foo@@QAE@XZ"(%struct.Foo* returned) unnamed_addr +declare x86_thiscallcc void @"\01??1Foo@@QAE@XZ"(%struct.Foo*) unnamed_addr + +define void @f() { +entry: + call void @g() + ret void +} + +define internal void @g() alwaysinline { +entry: + %inalloca.save = call i8* @llvm.stacksave() + %argmem = alloca inalloca <{ %struct.Foo }>, align 4 + %0 = getelementptr inbounds <{ %struct.Foo }>, <{ %struct.Foo }>* %argmem, i32 0, i32 0 + %call = call x86_thiscallcc %struct.Foo* @"\01??0Foo@@QAE@XZ"(%struct.Foo* %0) + call void @h(<{ %struct.Foo }>* inalloca %argmem) + call void @llvm.stackrestore(i8* %inalloca.save) + ret void +} + +; Function Attrs: alwaysinline inlinehint nounwind +define internal void @h(<{ %struct.Foo }>* inalloca) alwaysinline { +entry: + %o = getelementptr inbounds <{ %struct.Foo }>, <{ %struct.Foo }>* %0, i32 0, i32 0 + call x86_thiscallcc void @"\01??1Foo@@QAE@XZ"(%struct.Foo* %o) + ret void +} + +; CHECK: define void @f() +; CHECK: %inalloca.save.i = call i8* @llvm.stacksave() +; CHECK: alloca inalloca <{ %struct.Foo }>, align 4 +; CHECK: %call.i = call x86_thiscallcc %struct.Foo* @"\01??0Foo@@QAE@XZ"(%struct.Foo* %0) +; CHECK: %o.i.i = getelementptr inbounds <{ %struct.Foo }>, <{ %struct.Foo }>* %argmem.i, i32 0, i32 0 +; CHECK: call x86_thiscallcc void @"\01??1Foo@@QAE@XZ"(%struct.Foo* %o.i.i) +; CHECK: call void @llvm.stackrestore(i8* %inalloca.save.i) +; CHECK: ret void diff --git a/test/Transforms/Inline/inline_constprop.ll 
b/test/Transforms/Inline/inline_constprop.ll index de23b61..ab9e90c 100644 --- a/test/Transforms/Inline/inline_constprop.ll +++ b/test/Transforms/Inline/inline_constprop.ll @@ -279,3 +279,46 @@ return: %retval.0 = phi i32* [ %b, %if.end3 ], [ %a, %if.then ] ret i32* %retval.0 } + +declare i32 @PR28802.external(i32 returned %p1) + +define internal i32 @PR28802.callee() { +entry: + br label %cont + +cont: + %0 = phi i32 [ 0, %entry ] + %call = call i32 @PR28802.external(i32 %0) + ret i32 %call +} + +define i32 @PR28802() { +entry: + %call = call i32 @PR28802.callee() + ret i32 %call +} + +; CHECK-LABEL: define i32 @PR28802( +; CHECK: call i32 @PR28802.external(i32 0) +; CHECK: ret i32 0 + +define internal i32 @PR28848.callee(i32 %p2, i1 %c) { +entry: + br i1 %c, label %cond.end, label %cond.true + +cond.true: + br label %cond.end + +cond.end: + %cond = phi i32 [ 0, %cond.true ], [ %p2, %entry ] + %or = or i32 %cond, %p2 + ret i32 %or +} + +define i32 @PR28848() { +entry: + %call = call i32 @PR28848.callee(i32 0, i1 false) + ret i32 %call +} +; CHECK-LABEL: define i32 @PR28848( +; CHECK: ret i32 0 diff --git a/test/Transforms/InstCombine/call.ll b/test/Transforms/InstCombine/call.ll index ea338f0..5307dcb 100644 --- a/test/Transforms/InstCombine/call.ll +++ b/test/Transforms/InstCombine/call.ll @@ -276,3 +276,14 @@ define <2 x i16> @test16() { %X = call <2 x i16> bitcast (i32 ()* @test16a to <2 x i16> ()*)( ) ret <2 x i16> %X } + +declare i32 @pr28655(i32 returned %V) + +define i32 @test17() { +entry: + %C = call i32 @pr28655(i32 0) + ret i32 %C +} +; CHECK-LABEL: @test17( +; CHECK: call i32 @pr28655(i32 0) +; CHECK: ret i32 0 diff --git a/test/Transforms/InstCombine/log-pow.ll b/test/Transforms/InstCombine/log-pow.ll index a0c10d0..4e4a2b2 100644 --- a/test/Transforms/InstCombine/log-pow.ll +++ b/test/Transforms/InstCombine/log-pow.ll @@ -55,7 +55,8 @@ define double @log_exp2_not_fast(double %x) { ; CHECK-NEXT: %call3 = call fast double @log(double %call2) ; 
CHECK-NEXT: ret double %call3 -declare double @log(double) +declare double @log(double) #0 declare double @exp2(double) declare double @llvm.pow.f64(double, double) +attributes #0 = { nounwind readnone } diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll index e0e7bfc..413be89 100644 --- a/test/Transforms/InstCombine/select.ll +++ b/test/Transforms/InstCombine/select.ll @@ -1737,3 +1737,26 @@ define i32 @PR27137(i32 %a) { %s1 = select i1 %c1, i32 %s0, i32 -1 ret i32 %s1 } + +define i32 @select_icmp_slt0_xor(i32 %x) { +; CHECK-LABEL: @select_icmp_slt0_xor( +; CHECK-NEXT: [[TMP1:%.*]] = or i32 %x, -2147483648 +; CHECK-NEXT: ret i32 [[TMP1]] +; + %cmp = icmp slt i32 %x, zeroinitializer + %xor = xor i32 %x, 2147483648 + %x.xor = select i1 %cmp, i32 %x, i32 %xor + ret i32 %x.xor +} + +define <2 x i32> @select_icmp_slt0_xor_vec(<2 x i32> %x) { +; CHECK-LABEL: @select_icmp_slt0_xor_vec( +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> %x, +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %cmp = icmp slt <2 x i32> %x, zeroinitializer + %xor = xor <2 x i32> %x, + %x.xor = select <2 x i1> %cmp, <2 x i32> %x, <2 x i32> %xor + ret <2 x i32> %x.xor +} + diff --git a/test/Transforms/LCSSA/pr28424.ll b/test/Transforms/LCSSA/pr28424.ll new file mode 100644 index 0000000..cd79690 --- /dev/null +++ b/test/Transforms/LCSSA/pr28424.ll @@ -0,0 +1,87 @@ +; RUN: opt < %s -lcssa -S -o - | FileCheck %s +target triple = "x86_64-unknown-linux-gnu" + +; PR28424 +; Here LCSSA adds phi-nodes for %x into the loop exits. Then, SSAUpdater needs +; to insert phi-nodes to merge these values. That creates a new def, which in +; its turn needs another LCCSA phi-node, and this test ensures that we insert +; it. 
+ +; CHECK-LABEL: @foo1 +define internal i32 @foo1() { +entry: + br label %header + +header: + %x = add i32 0, 1 + br i1 undef, label %if, label %loopexit1 + +if: + br i1 undef, label %latch, label %loopexit2 + +latch: + br i1 undef, label %header, label %loopexit3 + +; CHECK: loopexit1: +; CHECK: %x.lcssa = phi i32 [ %x, %header ] +loopexit1: + br label %loop_with_insert_point + +; CHECK: loopexit2: +; CHECK: %x.lcssa1 = phi i32 [ %x, %if ] +loopexit2: + br label %exit + +; CHECK: loopexit3: +; CHECK: %x.lcssa2 = phi i32 [ %x, %latch ] +loopexit3: + br label %loop_with_insert_point + +; CHECK: loop_with_insert_point: +; CHECK: %x4 = phi i32 [ %x4, %loop_with_insert_point ], [ %x.lcssa2, %loopexit3 ], [ %x.lcssa, %loopexit1 ] +loop_with_insert_point: + br i1 undef, label %loop_with_insert_point, label %bb + +; CHECK: bb: +; CHECK: %x4.lcssa = phi i32 [ %x4, %loop_with_insert_point ] +bb: + br label %exit + +; CHECK: exit: +; CHECK: %x3 = phi i32 [ %x4.lcssa, %bb ], [ %x.lcssa1, %loopexit2 ] +exit: + ret i32 %x +} + +; CHECK-LABEL: @foo2 +define internal i32 @foo2() { +entry: + br label %header + +header: + %x = add i32 0, 1 + br i1 undef, label %latch, label %loopexit1 + +latch: + br i1 undef, label %header, label %loopexit2 + +; CHECK: loopexit1: +; CHECK: %x.lcssa = phi i32 [ %x, %header ] +loopexit1: + br label %loop_with_insert_point + +; CHECK: loopexit2: +; CHECK: %x.lcssa1 = phi i32 [ %x, %latch ] +loopexit2: + br label %loop_with_insert_point + +; CHECK: loop_with_insert_point: +; CHECK: %x2 = phi i32 [ %x2, %loop_with_insert_point ], [ %x.lcssa1, %loopexit2 ], [ %x.lcssa, %loopexit1 ] +loop_with_insert_point: + br i1 undef, label %loop_with_insert_point, label %exit + +; CHECK: exit: +; CHECK: %x2.lcssa = phi i32 [ %x2, %loop_with_insert_point ] +exit: + ret i32 %x +} diff --git a/test/Transforms/LCSSA/pr28608.ll b/test/Transforms/LCSSA/pr28608.ll new file mode 100644 index 0000000..3ba3fe8 --- /dev/null +++ b/test/Transforms/LCSSA/pr28608.ll @@ -0,0 +1,35 
@@ +; RUN: opt < %s -lcssa -disable-output +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; PR28608 +; Check that we don't crash on this test. + +define void @foo() { +entry: + br label %bb1 + +bb1: + br label %bb2 + +bb2: + %x = phi i32 [ undef, %bb5 ], [ undef, %bb1 ] + br i1 undef, label %bb3, label %bb6 + +bb3: + br i1 undef, label %bb5, label %bb4 + +bb4: + br label %bb6 + +bb5: + br label %bb2 + +bb6: + br label %bb1 + +exit: + %y = add i32 0, %x + ret void +} + diff --git a/test/Transforms/LoopSimplify/pr28272.ll b/test/Transforms/LoopSimplify/pr28272.ll new file mode 100644 index 0000000..49990f9 --- /dev/null +++ b/test/Transforms/LoopSimplify/pr28272.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -lcssa -loop-unroll -S | FileCheck %s +target triple = "x86_64-unknown-linux-gnu" + +; PR28272 +; When LoopSimplify separates nested loops, it might break LCSSA form: values +; from the original loop might be used in the outer loop. This test invokes +; loop-unroll, which calls loop-simplify before itself. If LCSSA is broken +; after loop-simplify, we crash on assertion. 
+ +; CHECK-LABEL: @foo +define void @foo() { +entry: + br label %header + +header: + br label %loop1 + +loop1: + br i1 true, label %loop1, label %bb43 + +bb43: + %a = phi i32 [ undef, %loop1 ], [ 0, %bb45 ], [ %a, %bb54 ] + %b = phi i32 [ 0, %loop1 ], [ 1, %bb54 ], [ %c, %bb45 ] + br i1 true, label %bb114, label %header + +bb114: + %c = add i32 0, 1 + %d = add i32 0, 1 + br i1 true, label %bb45, label %bb54 + +bb45: + %x = add i32 %d, 0 + br label %bb43 + +bb54: + br label %bb43 +} + +; CHECK-LABEL: @foo2 +define void @foo2() { +entry: + br label %outer + +outer.loopexit: + br label %outer + +outer: + br label %loop1 + +loop1: + br i1 true, label %loop1, label %loop2.preheader + +loop2.preheader: + %a.ph = phi i32 [ undef, %loop1 ] + %b.ph = phi i32 [ 0, %loop1 ] + br label %loop2 + +loop2: + %a = phi i32 [ 0, %loop2.if.true ], [ %a, %loop2.if.false ], [ %a.ph, %loop2.preheader ], [0, %bb] + %b = phi i32 [ 1, %loop2.if.false ], [ %c, %loop2.if.true ], [ %b.ph, %loop2.preheader ], [%c, %bb] + br i1 true, label %loop2.if, label %outer.loopexit + +loop2.if: + %c = add i32 0, 1 + switch i32 undef, label %loop2.if.false [i32 0, label %loop2.if.true + i32 1, label %bb] + +loop2.if.true: + br i1 undef, label %loop2, label %bb + +loop2.if.false: + br label %loop2 + +bb: + br label %loop2 +} diff --git a/test/Transforms/LoopStrengthReduce/X86/pr28719.ll b/test/Transforms/LoopStrengthReduce/X86/pr28719.ll new file mode 100644 index 0000000..0e74ff2 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/X86/pr28719.ll @@ -0,0 +1,47 @@ +; RUN: opt < %s -loop-reduce -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global i32 0, align 4 +@b = global i8 0, align 1 +@c = global [4 x i8] zeroinitializer, align 1 + +; Just make sure we don't generate code with uses not dominated by defs. 
+; CHECK-LABEL: @main( +define i32 @main() { +entry: + %a0 = load i32, i32* @a, align 4 + %cmpa = icmp slt i32 %a0, 4 + br i1 %cmpa, label %preheader, label %for.end + +preheader: + %b0 = load i8, i8* @b, align 1 + %b0sext = sext i8 %b0 to i64 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %preheader ], [ %iv.next, %lor.false ] + %mul = mul nsw i64 %b0sext, %iv + %multrunc = trunc i64 %mul to i32 + %cmp = icmp eq i32 %multrunc, 0 + br i1 %cmp, label %lor.false, label %if.then + +lor.false: + %cgep = getelementptr inbounds [4 x i8], [4 x i8]* @c, i64 0, i64 %iv + %ci = load i8, i8* %cgep, align 1 + %cisext = sext i8 %ci to i32 + %ivtrunc = trunc i64 %iv to i32 + %cmp2 = icmp eq i32 %cisext, %ivtrunc + %iv.next = add i64 %iv, 1 + br i1 %cmp2, label %for.body, label %if.then + +if.then: + tail call void @abort() + unreachable + +for.end: + ret i32 0 +} + +declare void @abort() diff --git a/test/Transforms/LoopVectorize/pr28541.ll b/test/Transforms/LoopVectorize/pr28541.ll new file mode 100644 index 0000000..7bb7f09 --- /dev/null +++ b/test/Transforms/LoopVectorize/pr28541.ll @@ -0,0 +1,71 @@ +; RUN: opt -loop-vectorize -pass-remarks=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; FIXME: Check for -pass-remarks-missed and -pass-remarks-analysis output when +; addAcyclicInnerLoop emits analysis. 
+ +; Check that opt does not crash on such input: +; +; a, b, c; +; fn1() { +; while (b--) { +; c = a; +; switch (a & 3) +; case 0: +; do +; case 3: +; case 2: +; case 1: +; ; +; while (--c) +; ; +; } +; } + +@b = common global i32 0, align 4 +@a = common global i32 0, align 4 +@c = common global i32 0, align 4 + +; CHECK-NOT: vectorized loop +; CHECK-LABEL: fn1 + +define i32 @fn1() { +entry: + %tmp2 = load i32, i32* @b, align 4 + %dec3 = add nsw i32 %tmp2, -1 + store i32 %dec3, i32* @b, align 4 + %tobool4 = icmp eq i32 %tmp2, 0 + br i1 %tobool4, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %entry + %tmp1 = load i32, i32* @a, align 4 + %and = and i32 %tmp1, 3 + %switch = icmp eq i32 %and, 0 + br label %while.body + +while.cond: ; preds = %do.cond + %dec = add nsw i32 %dec7, -1 + %tobool = icmp eq i32 %dec7, 0 + br i1 %tobool, label %while.cond.while.end_crit_edge, label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.cond + %dec7 = phi i32 [ %dec3, %while.body.lr.ph ], [ %dec, %while.cond ] + br i1 %switch, label %do.body, label %do.cond + +do.body: ; preds = %do.cond, %while.body + %dec25 = phi i32 [ %dec2, %do.cond ], [ %tmp1, %while.body ] + br label %do.cond + +do.cond: ; preds = %do.body, %while.body + %dec26 = phi i32 [ %dec25, %do.body ], [ %tmp1, %while.body ] + %dec2 = add nsw i32 %dec26, -1 + %tobool3 = icmp eq i32 %dec2, 0 + br i1 %tobool3, label %while.cond, label %do.body + +while.cond.while.end_crit_edge: ; preds = %while.cond + store i32 0, i32* @c, align 4 + store i32 -1, i32* @b, align 4 + br label %while.end + +while.end: ; preds = %while.cond.while.end_crit_edge, %entry + ret i32 undef +} diff --git a/test/Transforms/SafeStack/coloring-ssp.ll b/test/Transforms/SafeStack/coloring-ssp.ll new file mode 100644 index 0000000..d71babe --- /dev/null +++ b/test/Transforms/SafeStack/coloring-ssp.ll @@ -0,0 +1,34 @@ +; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s + +; %x and %y 
share a stack slot between them, but not with the stack guard. +define void @f() safestack sspreq { +; CHECK-LABEL: define void @f +entry: +; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr +; CHECK: getelementptr i8, i8* %[[USP]], i32 -16 + +; CHECK: %[[A:.*]] = getelementptr i8, i8* %[[USP]], i32 -8 +; CHECK: %[[StackGuardSlot:.*]] = bitcast i8* %[[A]] to i8** +; CHECK: store i8* %{{.*}}, i8** %[[StackGuardSlot]] + + %x = alloca i64, align 8 + %y = alloca i64, align 8 + %x0 = bitcast i64* %x to i8* + %y0 = bitcast i64* %y to i8* + + call void @llvm.lifetime.start(i64 -1, i8* %x0) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -16 + call void @capture64(i64* %x) + call void @llvm.lifetime.end(i64 -1, i8* %x0) + + call void @llvm.lifetime.start(i64 -1, i8* %y0) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -16 + call void @capture64(i64* %y) + call void @llvm.lifetime.end(i64 -1, i8* %y0) + + ret void +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.end(i64, i8* nocapture) +declare void @capture64(i64*) diff --git a/test/Transforms/SafeStack/layout-region-split.ll b/test/Transforms/SafeStack/layout-region-split.ll new file mode 100644 index 0000000..ceb18bb --- /dev/null +++ b/test/Transforms/SafeStack/layout-region-split.ll @@ -0,0 +1,84 @@ +; Regression test for safestack layout. Used to fail with asan. 
+; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s + +define void @f() safestack { +; CHECK-LABEL: define void @f +entry: +; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr +; CHECK: getelementptr i8, i8* %[[USP]], i32 -224 + + %x0 = alloca i8, align 16 + %x1 = alloca i8, align 16 + %x2 = alloca i8, align 16 + %x3 = alloca i8, align 16 + %x4 = alloca i8, align 16 + %x5 = alloca i8, align 16 + %x6 = alloca i8, align 16 + %x7 = alloca i8, align 16 + %x8 = alloca i8, align 16 + %x9 = alloca i8, align 16 + %x10 = alloca i8, align 16 + %x11 = alloca i8, align 16 + %x12 = alloca i8, align 16 + %x13 = alloca i8, align 16 + %y0 = alloca i8, align 2 + %y1 = alloca i8, align 2 + %y2 = alloca i8, align 2 + %y3 = alloca i8, align 2 + %y4 = alloca i8, align 2 + %y5 = alloca i8, align 2 + %y6 = alloca i8, align 2 + %y7 = alloca i8, align 2 + %y8 = alloca i8, align 2 + +; CHECK: getelementptr i8, i8* %[[USP]], i32 -16 + call void @capture8(i8* %x0) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -32 + call void @capture8(i8* %x1) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -48 + call void @capture8(i8* %x2) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -64 + call void @capture8(i8* %x3) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -80 + call void @capture8(i8* %x4) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -96 + call void @capture8(i8* %x5) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -112 + call void @capture8(i8* %x6) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -128 + call void @capture8(i8* %x7) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -144 + call void @capture8(i8* %x8) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -160 + call void @capture8(i8* %x9) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -176 + call void @capture8(i8* %x10) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -192 + call void @capture8(i8* %x11) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -208 + call void @capture8(i8* %x12) +; CHECK: 
getelementptr i8, i8* %[[USP]], i32 -224 + call void @capture8(i8* %x13) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -2 + call void @capture8(i8* %y0) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -4 + call void @capture8(i8* %y1) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -6 + call void @capture8(i8* %y2) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -8 + call void @capture8(i8* %y3) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -10 + call void @capture8(i8* %y4) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -12 + call void @capture8(i8* %y5) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -14 + call void @capture8(i8* %y6) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -18 + call void @capture8(i8* %y7) +; CHECK: getelementptr i8, i8* %[[USP]], i32 -20 + call void @capture8(i8* %y8) + + ret void +} + +declare void @capture8(i8*) diff --git a/unittests/ADT/SCCIteratorTest.cpp b/unittests/ADT/SCCIteratorTest.cpp index da8c044..597661f 100644 --- a/unittests/ADT/SCCIteratorTest.cpp +++ b/unittests/ADT/SCCIteratorTest.cpp @@ -230,6 +230,7 @@ public: template struct GraphTraits > { typedef typename Graph::NodeType NodeType; + typedef typename Graph::NodeType *NodeRef; typedef typename Graph::ChildIterator ChildIteratorType; static inline NodeType *getEntryNode(const Graph &G) { return G.AccessNode(0); } diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp index 77a2dba..15b03b3 100644 --- a/unittests/IR/MetadataTest.cpp +++ b/unittests/IR/MetadataTest.cpp @@ -449,6 +449,40 @@ TEST_F(MDNodeTest, DistinctOnUniquingCollision) { EXPECT_FALSE(Wrapped1->isDistinct()); } +TEST_F(MDNodeTest, UniquedOnDeletedOperand) { + // temp !{} + TempMDTuple T = MDTuple::getTemporary(Context, None); + + // !{temp !{}} + Metadata *Ops[] = {T.get()}; + MDTuple *N = MDTuple::get(Context, Ops); + + // !{temp !{}} => !{null} + T.reset(); + ASSERT_TRUE(N->isUniqued()); + Metadata *NullOps[] = {nullptr}; + ASSERT_EQ(N, MDTuple::get(Context, NullOps)); +} + 
+TEST_F(MDNodeTest, DistinctOnDeletedValueOperand) { + // i1* @GV + Type *Ty = Type::getInt1PtrTy(Context); + std::unique_ptr GV( + new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage)); + ConstantAsMetadata *Op = ConstantAsMetadata::get(GV.get()); + + // !{i1* @GV} + Metadata *Ops[] = {Op}; + MDTuple *N = MDTuple::get(Context, Ops); + + // !{i1* @GV} => !{null} + GV.reset(); + ASSERT_TRUE(N->isDistinct()); + ASSERT_EQ(nullptr, N->getOperand(0)); + Metadata *NullOps[] = {nullptr}; + ASSERT_NE(N, MDTuple::get(Context, NullOps)); +} + TEST_F(MDNodeTest, getDistinct) { // !{} MDNode *Empty = MDNode::get(Context, None); @@ -669,7 +703,7 @@ TEST_F(MDNodeTest, replaceWithUniquedResolvingOperand) { EXPECT_TRUE(N->isResolved()); } -TEST_F(MDNodeTest, replaceWithUniquedChangingOperand) { +TEST_F(MDNodeTest, replaceWithUniquedDeletedOperand) { // i1* @GV Type *Ty = Type::getInt1PtrTy(Context); std::unique_ptr GV( @@ -686,8 +720,33 @@ TEST_F(MDNodeTest, replaceWithUniquedChangingOperand) { // !{i1* @GV} => !{null} GV.reset(); - ASSERT_TRUE(N->isUniqued()); + ASSERT_TRUE(N->isDistinct()); + ASSERT_EQ(nullptr, N->getOperand(0)); Metadata *NullOps[] = {nullptr}; + ASSERT_NE(N, MDTuple::get(Context, NullOps)); +} + +TEST_F(MDNodeTest, replaceWithUniquedChangedOperand) { + // i1* @GV + Type *Ty = Type::getInt1PtrTy(Context); + std::unique_ptr GV( + new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage)); + ConstantAsMetadata *Op = ConstantAsMetadata::get(GV.get()); + + // temp !{i1* @GV} + Metadata *Ops[] = {Op}; + MDTuple *N = MDTuple::getTemporary(Context, Ops).release(); + + // temp !{i1* @GV} => !{i1* @GV} + ASSERT_EQ(N, MDNode::replaceWithUniqued(TempMDTuple(N))); + ASSERT_TRUE(N->isUniqued()); + + // !{i1* @GV} => !{i1* @GV2} + std::unique_ptr GV2( + new GlobalVariable(Ty, false, GlobalValue::ExternalLinkage)); + GV->replaceAllUsesWith(GV2.get()); + ASSERT_TRUE(N->isUniqued()); + Metadata *NullOps[] = {ConstantAsMetadata::get(GV2.get())}; ASSERT_EQ(N, 
MDTuple::get(Context, NullOps)); } diff --git a/unittests/Support/IteratorTest.cpp b/unittests/Support/IteratorTest.cpp index 8384832..63dfa2a 100644 --- a/unittests/Support/IteratorTest.cpp +++ b/unittests/Support/IteratorTest.cpp @@ -16,6 +16,24 @@ using namespace llvm; namespace { +template struct Shadow; + +struct WeirdIter : std::iterator, Shadow<1>, + Shadow<2>, Shadow<3>> {}; + +struct AdaptedIter : iterator_adaptor_base {}; + +// Test that iterator_adaptor_base forwards typedefs, if value_type is +// unchanged. +static_assert(std::is_same>::value, + ""); +static_assert( + std::is_same>::value, ""); +static_assert(std::is_same>::value, + ""); +static_assert(std::is_same>::value, + ""); + TEST(PointeeIteratorTest, Basic) { int arr[4] = { 1, 2, 3, 4 }; SmallVector V; @@ -98,4 +116,73 @@ TEST(PointeeIteratorTest, SmartPointer) { EXPECT_EQ(End, I); } +TEST(FilterIteratorTest, Lambda) { + auto IsOdd = [](int N) { return N % 2 == 1; }; + int A[] = {0, 1, 2, 3, 4, 5, 6}; + auto Range = make_filter_range(A, IsOdd); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + +TEST(FilterIteratorTest, CallableObject) { + int Counter = 0; + struct Callable { + int &Counter; + + Callable(int &Counter) : Counter(Counter) {} + + bool operator()(int N) { + Counter++; + return N % 2 == 1; + } + }; + Callable IsOdd(Counter); + int A[] = {0, 1, 2, 3, 4, 5, 6}; + auto Range = make_filter_range(A, IsOdd); + EXPECT_EQ(2, Counter); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_GE(Counter, 7); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + +TEST(FilterIteratorTest, FunctionPointer) { + bool (*IsOdd)(int) = [](int N) { return N % 2 == 1; }; + int A[] = {0, 1, 2, 3, 4, 5, 6}; + auto Range = make_filter_range(A, IsOdd); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + +TEST(FilterIteratorTest, Composition) { + auto IsOdd = [](int N) { return N % 2 == 1; }; + std::unique_ptr A[] 
= {make_unique(0), make_unique(1), + make_unique(2), make_unique(3), + make_unique(4), make_unique(5), + make_unique(6)}; + using PointeeIterator = pointee_iterator *>; + auto Range = make_filter_range( + make_range(PointeeIterator(std::begin(A)), PointeeIterator(std::end(A))), + IsOdd); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + +TEST(FilterIteratorTest, InputIterator) { + struct InputIterator + : iterator_adaptor_base { + using BaseT = + iterator_adaptor_base; + + InputIterator(int *It) : BaseT(It) {} + }; + + auto IsOdd = [](int N) { return N % 2 == 1; }; + int A[] = {0, 1, 2, 3, 4, 5, 6}; + auto Range = make_filter_range( + make_range(InputIterator(std::begin(A)), InputIterator(std::end(A))), + IsOdd); + SmallVector Actual(Range.begin(), Range.end()); + EXPECT_EQ((SmallVector{1, 3, 5}), Actual); +} + } // anonymous namespace diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index 37af976..b9cc38d 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -38,7 +38,6 @@ do_test_suite="yes" do_openmp="yes" do_lldb="no" BuildDir="`pwd`" -use_autoconf="no" ExtraConfigureFlags="" ExportBranch="" @@ -57,7 +56,6 @@ function usage() { echo " -no-compare-files Don't test that phase 2 and 3 files are identical." echo " -use-gzip Use gzip instead of xz." echo " -configure-flags FLAGS Extra flags to pass to the configure step." - echo " -use-autoconf Use autoconf instead of cmake" echo " -svn-path DIR Use the specified DIR instead of a release." 
echo " For example -svn-path trunk or -svn-path branches/release_37" echo " -no-rt Disable check-out & build Compiler-RT" @@ -127,9 +125,6 @@ while [ $# -gt 0 ]; do -use-gzip | --use-gzip ) use_gzip="yes" ;; - -use-autoconf | --use-autoconf ) - use_autoconf="yes" - ;; -no-rt ) do_rt="no" ;; @@ -164,13 +159,11 @@ while [ $# -gt 0 ]; do shift done -if [ "$use_autoconf" = "no" ]; then - if [ "$do_test_suite" = "yes" ]; then - # See llvm.org/PR26146. - echo Skipping test-suite build when using CMake. - echo It will still be exported. - do_test_suite="export-only" - fi +if [ "$do_test_suite" = "yes" ]; then + # See llvm.org/PR26146. + echo Skipping test-suite build when using CMake. + echo It will still be exported. + do_test_suite="export-only" fi # Check required arguments. @@ -337,17 +330,14 @@ function configure_llvmCore() { Release ) BuildType="Release" Assertions="OFF" - ConfigureFlags="--enable-optimized --disable-assertions" ;; Release+Asserts ) BuildType="Release" Assertions="ON" - ConfigureFlags="--enable-optimized --enable-assertions" ;; Debug ) BuildType="Debug" Assertions="ON" - ConfigureFlags="--disable-optimized --enable-assertions" ;; * ) echo "# Invalid flavor '$Flavor'" @@ -362,29 +352,18 @@ function configure_llvmCore() { cd $ObjDir echo "# Configuring llvm $Release-$RC $Flavor" - if [ "$use_autoconf" = "yes" ]; then - echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \ - $BuildDir/llvm.src/configure \ - $ConfigureFlags --disable-timestamps $ExtraConfigureFlags \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log - env CC="$c_compiler" CXX="$cxx_compiler" \ - $BuildDir/llvm.src/configure \ - $ConfigureFlags --disable-timestamps $ExtraConfigureFlags \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log - else - echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \ - cmake -G "Unix Makefiles" \ - -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ - -DLLVM_CONFIGTIME="(timestamp not enabled)" \ - $ExtraConfigureFlags 
$BuildDir/llvm.src \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log - env CC="$c_compiler" CXX="$cxx_compiler" \ - cmake -G "Unix Makefiles" \ - -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ - -DLLVM_CONFIGTIME="(timestamp not enabled)" \ - $ExtraConfigureFlags $BuildDir/llvm.src \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log - fi + echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \ + cmake -G "Unix Makefiles" \ + -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ + -DLLVM_CONFIGTIME="(timestamp not enabled)" \ + $ExtraConfigureFlags $BuildDir/llvm.src \ + 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log + env CC="$c_compiler" CXX="$cxx_compiler" \ + cmake -G "Unix Makefiles" \ + -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ + -DLLVM_CONFIGTIME="(timestamp not enabled)" \ + $ExtraConfigureFlags $BuildDir/llvm.src \ + 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log cd $BuildDir } @@ -420,14 +399,6 @@ function test_llvmCore() { deferred_error $Phase $Flavor "check-all failed" fi - if [ "$use_autoconf" = "yes" ]; then - # In the cmake build, unit tests are run as part of check-all. - if ! ( ${MAKE} -k unittests 2>&1 | \ - tee $LogDir/llvm.unittests-Phase$Phase-$Flavor.log ) ; then - deferred_error $Phase $Flavor "unittests failed" - fi - fi - cd $BuildDir }